linux kernel 2.4.5 ipv4 socket层的一点解释

/ns/wz/sys/data/20010626104710.htm

linux kernel 2.4.5 ipv4 socket层的一点解释
by scancat <scancat_x86@etang.com>

声明:本人水平有限,所以弄的不好,不要骂我啊 :)
1.新建socket
函数原形:
static int inet_create(struct socket *sock, int protocol)
在net/ipv4/af_inet.c中
详细解释
/*
 * inet_create() - create a new AF_INET socket (net/ipv4/af_inet.c).
 * Reached from sys_socket(); selects the proto ops for the requested
 * socket type, allocates the struct sock, and runs the protocol's
 * own init hook (sk->prot->init) if it has one.
 */
static int inet_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct proto *prot;
sock->state = SS_UNCONNECTED; /* mark the socket as not yet connected */
sk = sk_alloc(PF_INET, GFP_KERNEL, 1); /* allocate the struct sock */
   /* sk_alloc() is in net/core/sock.c */
if (sk == NULL)
 goto do_oom;
switch (sock->type) {
case SOCK_STREAM:  /* TCP */
 if (protocol && protocol != IPPROTO_TCP)
  goto free_and_noproto;
 protocol = IPPROTO_TCP;
 prot = &tcp_prot; /* tcp_prot is defined in net/ipv4/tcp_ipv4.c */
 sock->ops = &inet_stream_ops; /* socket operations for SOCK_STREAM */
 break;
case SOCK_SEQPACKET:  /* not supported by AF_INET */
 goto free_and_badtype;
case SOCK_DGRAM:  /* UDP */
 if (protocol && protocol != IPPROTO_UDP)
  goto free_and_noproto;
 protocol = IPPROTO_UDP;
 sk->no_check = UDP_CSUM_DEFAULT;
 prot=&udp_prot;  /* udp_prot is defined in net/ipv4/udp.c */
 sock->ops = &inet_dgram_ops; /* socket operations for SOCK_DGRAM */
 break;
case SOCK_RAW:  /* raw IP */
 if (!capable(CAP_NET_RAW)) /* creating a SOCK_RAW socket requires CAP_NET_RAW */
  goto free_and_badperm;
 if (!protocol)  /* protocol must be non-zero for raw sockets */
  goto free_and_noproto;
 prot = &raw_prot; /* raw_prot is defined in net/ipv4/raw.c */
 sk->reuse = 1;  /* allow address reuse */
 sk->num = protocol;
 sock->ops = &inet_dgram_ops; /* raw sockets share the datagram socket ops */
 if (protocol == IPPROTO_RAW)
  sk->protinfo.af_inet.hdrincl = 1;
   /* caller builds its own IP header (IP_HDRINCL semantics) */
 break;
default:
 goto free_and_badtype;
}
if (ipv4_config.no_pmtu_disc)
 sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
else
 sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
sk->protinfo.af_inet.id = 0;
sock_init_data(sock,sk); /* generic initialisation of the sock fields */
   /* sock_init_data() is in net/core/sock.c */
sk->destruct = inet_sock_destruct; /* inet_sock_destruct runs when the sock is destroyed */
sk->zapped = 0;
sk->family = PF_INET;
sk->protocol = protocol;
sk->prot = prot;
sk->backlog_rcv = prot->backlog_rcv; /* backlog_rcv() is per-protocol; see each proto definition */
sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; /* default TTL */
   /* tunable via /proc/sys/net/ipv4/ip_default_ttl */
sk->protinfo.af_inet.mc_loop = 1;
sk->protinfo.af_inet.mc_ttl = 1;
sk->protinfo.af_inet.mc_index = 0;
sk->protinfo.af_inet.mc_list = NULL;
#ifdef INET_REFCNT_DEBUG
atomic_inc(&inet_sock_nr);
#endif
if (sk->num) {
 /* It assumes that any protocol which allows
  * the user to assign a number at socket
  * creation time automatically
  * shares.
  */
 sk->sport = htons(sk->num); /* set the local port (raw sockets arrive here) */
 /* Add to protocol hash chains. */
 sk->prot->hash(sk);
}
if (sk->prot->init) {
 int err = sk->prot->init(sk); /* protocol-specific socket initialisation */
 if (err != 0) {
  inet_sock_release(sk);
  return(err);
 }
}
return(0);
free_and_badtype:
sk_free(sk);  /* release the sock memory */
return -ESOCKTNOSUPPORT;
free_and_badperm:
sk_free(sk);
return -EPERM;
free_and_noproto:
sk_free(sk);
return -EPROTONOSUPPORT;
do_oom:
return -ENOBUFS;
}
在net/core/sock.c
/*
 * sock_init_data() - generic initialisation of a freshly allocated
 * struct sock (net/core/sock.c): queues, timer, default buffer
 * sizes, the sock<->socket back-pointers and the default callbacks.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
skb_queue_head_init(&sk->receive_queue); /* initialise the three queues: receive, write, error */
skb_queue_head_init(&sk->write_queue);
skb_queue_head_init(&sk->error_queue);
init_timer(&sk->timer);  /* initialise the per-sock timer */

sk->allocation = GFP_KERNEL;
sk->rcvbuf = sysctl_rmem_default;
sk->sndbuf = sysctl_wmem_default;
sk->state = TCP_CLOSE;
sk->zapped = 1;
sk->socket = sock;
if(sock)
{
 sk->type = sock->type;
 sk->sleep = &sock->wait;
 sock->sk = sk;
} else
 sk->sleep = NULL;
sk->dst_lock = RW_LOCK_UNLOCKED;
sk->callback_lock = RW_LOCK_UNLOCKED;
   /* sock_def_wakeup(), sock_def_readable(),
    sock_def_write_space(), sock_def_error_report(),
    sock_def_destruct() all live in net/core/sock.c */
sk->state_change = sock_def_wakeup;
sk->data_ready = sock_def_readable;
sk->write_space = sock_def_write_space;
sk->error_report = sock_def_error_report;
sk->destruct      =    sock_def_destruct;
sk->peercred.pid = 0;
sk->peercred.uid = -1;
sk->peercred.gid = -1;
sk->rcvlowat = 1;
sk->rcvtimeo = MAX_SCHEDULE_TIMEOUT; /* receive/send timeouts default to "wait forever" */
sk->sndtimeo = MAX_SCHEDULE_TIMEOUT;
atomic_set(&sk->refcnt, 1);
}
1.1 SOCK_STREAM的初始化
在net/ipv4/tcp_ipv4.c
/*
 * tcp_v4_init_sock() - TCP's per-socket init hook, invoked from
 * inet_create() through sk->prot->init (net/ipv4/tcp_ipv4.c).
 * Sets up retransmission timers and initial congestion state.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
tp->rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;

/* So many TCP implementations out there (incorrectly) count the
 * initial SYN frame in their delayed-ACK and congestion control
 * algorithms that we must have the following bandaid to talk
 * efficiently to them. -DaveM
 */
tp->snd_cwnd = 2;
/* See draft-stevens-tcpca-spec-01 for discussion of the
 * initialization of these values.
 */
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
sk->state = TCP_CLOSE;
sk->write_space = tcp_write_space; /* tcp_write_space() is in net/ipv4/tcp.c */
sk->use_write_queue = 1;
sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
   /* ipv4_specific is in net/ipv4/tcp_ipv4.c */
sk->sndbuf = sysctl_tcp_wmem[1]; /* default send/receive buffer sizes */
sk->rcvbuf = sysctl_tcp_rmem[1]; /* sysctl_tcp_* are in net/ipv4/tcp.c */
atomic_inc(&tcp_sockets_allocated); /* tcp_sockets_allocated counts live TCP sockets */
return 0;
}
SOCK_DGRAM无初始化
1.2 SOCK_RAW初始化
在net/ipv4/raw.c
/*
 * raw_init() - SOCK_RAW's per-socket init hook (net/ipv4/raw.c).
 * An ICMP raw socket starts with an all-zero ICMP type filter,
 * i.e. no message types are filtered out.
 */
static int raw_init(struct sock *sk)
{
	struct raw_opt *opt = &(sk->tp_pinfo.tp_raw4);

	if (sk->num == IPPROTO_ICMP)
		memset(&opt->filter, 0, sizeof(opt->filter));

	return 0;
}
2.Server
2.1 bind
/*
 * inet_bind() - AF_INET bind() implementation (net/ipv4/af_inet.c).
 * Validates the requested local address and port, then asks the
 * protocol's get_port() hook to claim the port.
 */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
struct sock *sk=sock->sk;
unsigned short snum;
int chk_addr_ret;
int err;
/* If the socket has its own bind function then use it. (RAW) */
if(sk->prot->bind)
 return sk->prot->bind(sk, uaddr, addr_len);
    /* only SOCK_RAW supplies its own bind function */
if (addr_len < sizeof(struct sockaddr_in))
 return -EINVAL;
chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
    /* inet_addr_type() classifies the address */
    /* (defined in net/ipv4/fib_frontend.c) */
/* Not specified by any standard per-se, however it breaks too
 * many applications when removed. It is unfortunate since
 * allowing applications to make a non-local bind solves
 * several problems with systems using dynamic addressing.
 * (ie. your servers still start up even if your ISDN link
 * is temporarily down)
 */
if (sysctl_ip_nonlocal_bind == 0 &&
   sk->protinfo.af_inet.freebind == 0 &&
   addr->sin_addr.s_addr != INADDR_ANY &&
   chk_addr_ret != RTN_LOCAL &&
   chk_addr_ret != RTN_MULTICAST &&
   chk_addr_ret != RTN_BROADCAST)
 return -EADDRNOTAVAIL;
snum = ntohs(addr->sin_port);
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
   /* binding a port below PROT_SOCK (1024) needs CAP_NET_BIND_SERVICE */
 return -EACCES;
/*   We keep a pair of addresses. rcv_saddr is the one
 *   used by hash lookups, and saddr is used for transmit.
 *
 *   In the BSD API these are the same except where it
 *   would be illegal to use them (multicast/broadcast) in
 *   which case the sending device address is used.
 */
lock_sock(sk);
/* Check these errors (active socket, double bind). */
err = -EINVAL;
if ((sk->state != TCP_CLOSE)  ||
   (sk->num != 0))
 goto out;
sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
 sk->saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->prot->get_port(sk, snum) != 0) { /* get_port() verifies the port is free or legitimately reusable */
 sk->saddr = sk->rcv_saddr = 0;
 err = -EADDRINUSE;
 goto out;
}
if (sk->rcv_saddr)
 sk->userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
 sk->userlocks |= SOCK_BINDPORT_LOCK;
sk->sport = htons(sk->num);
sk->daddr = 0;
sk->dport = 0;
sk_dst_reset(sk);
err = 0;
out:
release_sock(sk);
return err;
}
SOCK_STREAM和SOCK_DGRAM用默认的bind
2.1.1 SOCK_RAW的bind
在net/ipv4/raw.c
/*
 * raw_bind() - SOCK_RAW's private bind (net/ipv4/raw.c).
 * Raw sockets bind only a local address, never a port.
 */
static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
int ret = -EINVAL;
int chk_addr_ret;
if (sk->state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
 goto out;
chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
    /* inet_addr_type() classifies the address */
    /* (defined in net/ipv4/fib_frontend.c) */
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
   chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
 goto out;
sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
    /* sk->rcv_saddr: bound local address used for lookups */
    /* sk->saddr: source address used when transmitting */
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
 sk->saddr = 0; /* Use device */ /* multicast/broadcast: source address comes from the device */
sk_dst_reset(sk);
ret = 0;
out: return ret;
}
2.2 listen
2.2.1 SOCK_STREAM的listen
在net/ipv4/af_inet.c
/*
 * inet_listen() - AF_INET listen() (net/ipv4/af_inet.c).
 * Only SOCK_STREAM sockets may listen; on a socket that is
 * already listening only the backlog is adjusted.
 */
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
int err;
lock_sock(sk);
err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
 goto out;
old_state = sk->state;
if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
 goto out;
/* Really, if the socket is already in listen state
 * we can only allow the backlog to be adjusted.
 */
if (old_state != TCP_LISTEN) {
 err = tcp_listen_start(sk); /* the real TCP listen work happens here */
 if (err)
  goto out;
}
sk->max_ack_backlog = backlog;
err = 0;
out:
release_sock(sk);
return err;
}
tcp_listen_start在net/ipv4/tcp.c(原型声明在include/net/tcp.h)
/*
 * tcp_listen_start() - move a TCP socket into LISTEN state
 * (net/ipv4/tcp.c): allocate the SYN-queue bookkeeping, claim
 * the local port, and hash the socket into the listen table.
 */
int tcp_listen_start(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct tcp_listen_opt *lopt;
sk->max_ack_backlog = 0;
sk->ack_backlog = 0;
tp->accept_queue = tp->accept_queue_tail = NULL;
tp->syn_wait_lock = RW_LOCK_UNLOCKED;
tcp_delack_init(tp);  /* zeroes the delayed-ACK state in tp */
   /* include/net/tcp.h */
lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
if (!lopt)
 return -ENOMEM;
memset(lopt, 0, sizeof(struct tcp_listen_opt));
for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
  break;
write_lock_bh(&tp->syn_wait_lock);
tp->listen_opt = lopt;
write_unlock_bh(&tp->syn_wait_lock);
/* There is race window here: we announce ourselves listening,
 * but this transition is still not validated by get_port().
 * It is OK, because this socket enters to hash table only
 * after validation is complete.
 */
sk->state = TCP_LISTEN;
if (sk->prot->get_port(sk, sk->num) == 0) { /* re-validate that the port is still ours */
 sk->sport = htons(sk->num); /* set the source port */
 sk_dst_reset(sk);
 sk->prot->hash(sk);  /* add the socket to the listening hash table */
 return 0;
}
sk->state = TCP_CLOSE;
write_lock_bh(&tp->syn_wait_lock);
tp->listen_opt = NULL;
write_unlock_bh(&tp->syn_wait_lock);
kfree(lopt);
return -EADDRINUSE;
}
SOCK_DGRAM 和 SOCK_RAW 不支持listen
2.3 accept
2.3.1 SOCK_STREAM的accept
在net/ipv4/af_inet.c
/*
 * inet_accept() - AF_INET accept() (net/ipv4/af_inet.c).
 * Delegates to the protocol's accept hook and grafts the
 * resulting sock onto the caller-supplied new socket.
 */
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
struct sock *sk1 = sock->sk;
struct sock *sk2;
int err = -EINVAL;
if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)
 goto do_err;
lock_sock(sk2);
BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));
sock_graft(sk2, newsock); /* graft sk2 onto newsock */
   /* sock_graft() is in include/net/sock.h */
newsock->state = SS_CONNECTED;
release_sock(sk2);
return 0;
do_err:
return err;
}
SOCK_DGRAM 和 SOCK_RAW 不支持 accept
2.3.1.1 TCP协议的accept
在net/ipv4/tcp.c
/*
 * tcp_accept() - TCP's accept hook (net/ipv4/tcp.c).
 * Takes the first established connection off tp->accept_queue,
 * sleeping in wait_for_connect() when the socket is blocking
 * and the queue is empty.
 */
struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct open_request *req;
struct sock *newsk;
int error;
lock_sock(sk);
/* We need to make sure that this socket is listening,
 * and that it has something pending.
 */
error = -EINVAL;
if (sk->state != TCP_LISTEN) /* the socket must be in LISTEN state */
 goto out;
/* Find already established connection */
if (!tp->accept_queue) { /* is a completed connection already queued? */
 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
   /* 0 when non-blocking, otherwise the receive timeout */
   /* sock_rcvtimeo() is in include/net/sock.h */
 /* If this is a non blocking socket don't sleep */
 error = -EAGAIN;
 if (!timeo)  /* non-blocking mode: return immediately */
  goto out;
 error = wait_for_connect(sk, timeo); /* sleep until a connection arrives */
 if (error)
  goto out;
}
req = tp->accept_queue;
if ((tp->accept_queue = req->dl_next) == NULL)
 tp->accept_queue_tail = NULL;
 newsk = req->sk; /* NOTE: runs unconditionally despite the indentation */
tcp_acceptq_removed(sk);  /* one less queued connection on sk */
    /* include/net/tcp.h */
tcp_openreq_fastfree(req);  /* free the open_request */
    /* include/net/tcp.h */
BUG_TRAP(newsk->state != TCP_SYN_RECV);
release_sock(sk);
return newsk;
out:
release_sock(sk);
*err = error;
return NULL;
}
/* Called only when the socket is in blocking mode: sleeps until the
 * accept queue is non-empty, the socket leaves LISTEN state, a
 * signal arrives, or the timeout expires. */
/* In net/ipv4/tcp.c */
static int wait_for_connect(struct sock * sk, long timeo)
{
DECLARE_WAITQUEUE(wait, current);
int err;
/*
 * True wake-one mechanism for incoming connections: only
 * one process gets woken up, not the 'whole herd'.
 * Since we do not 'race & poll' for established sockets
 * anymore, the common case will execute the loop only once.
 *
 * Subtle issue: "add_wait_queue_exclusive()" will be added
 * after any current non-exclusive waiters, and we know that
 * it will always _stay_ after any new non-exclusive waiters
 * because all non-exclusive waiters are added at the
 * beginning of the wait-queue. As such, it's ok to "drop"
 * our exclusiveness temporarily when we get woken up without
 * having to remove and re-insert us on the wait queue.
 */
add_wait_queue_exclusive(sk->sleep, &wait);
for (;;) {
 current->state = TASK_INTERRUPTIBLE;
 release_sock(sk);
 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
  timeo = schedule_timeout(timeo); /* sleep for up to timeo jiffies */
 lock_sock(sk);
 err = 0;
 if (sk->tp_pinfo.af_tcp.accept_queue) /* accept queue non-empty */
    /* i.e. a connection has come in */
  break;
 err = -EINVAL;
 if (sk->state != TCP_LISTEN)
  break;
 err = sock_intr_errno(timeo);
 if (signal_pending(current))
  break;
 err = -EAGAIN;
 if (!timeo)
  break;
}
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
return err;
}
3.Client
3.1 connect
3.1.1 SOCK_STREAM的connect
在net/ipv4/af_inet.c

/*
 * inet_stream_connect() - connect() for SOCK_STREAM sockets
 * (net/ipv4/af_inet.c). Starts the protocol-level connect and,
 * for blocking sockets, waits for the handshake to complete.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
  int addr_len, int flags)
{
struct sock *sk=sock->sk;
int err;
long timeo;
lock_sock(sk);
if (uaddr->sa_family == AF_UNSPEC) {
 err = sk->prot->disconnect(sk, flags); /* AF_UNSPEC means: drop the connection */
 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
 goto out;
}
switch (sock->state) {
default:
 err = -EINVAL;
 goto out;
case SS_CONNECTED:
 err = -EISCONN;
 goto out;
case SS_CONNECTING:
 err = -EALREADY;
 /* Fall out of switch with err, set for this state */
 break;
case SS_UNCONNECTED:
 err = -EISCONN;
 if (sk->state != TCP_CLOSE)
  goto out;
 err = -EAGAIN;
 if (sk->num == 0) {
  if (sk->prot->get_port(sk, 0) != 0) /* ask the protocol for a local port */
  goto out;
  sk->sport = htons(sk->num);
 }
 err = sk->prot->connect(sk, uaddr, addr_len); /* protocol-level connect */
 if (err < 0)
  goto out;
  sock->state = SS_CONNECTING;  /* socket is now in the connecting state */
 /* Just entered SS_CONNECTING state; the only
  * difference is that return value in non-blocking
  * case is EINPROGRESS, rather than EALREADY.
  */
 err = -EINPROGRESS;
 break;
}
timeo = sock_sndtimeo(sk, flags&O_NONBLOCK); /* 0 when non-blocking */
    /* sock_sndtimeo() is in include/net/sock.h */
if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { /* handshake not finished yet */
 /* Error code is set above */
 if (!timeo || !inet_wait_for_connect(sk, timeo))
    /* non-blocking mode: return immediately */
    /* blocking mode: sleep in inet_wait_for_connect() */
  goto out;
 err = sock_intr_errno(timeo);
 if (signal_pending(current))
  goto out;
}
/* Connection was closed by RST, timeout, ICMP error
 * or another process disconnected us.
 */
if (sk->state == TCP_CLOSE)
 goto sock_error;
/* sk->err may be not zero now, if RECVERR was ordered by user
 * and error was received after socket entered established state.
 * Hence, it is handled normally after connect() return successfully.
 */
sock->state = SS_CONNECTED;  /* mark the socket as connected */
err = 0;
out:
release_sock(sk);
return err;
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
if (sk->prot->disconnect(sk, flags))
 sock->state = SS_DISCONNECTING;
goto out;
}
/* Called only when the socket is in blocking mode: sleeps while
 * the TCP handshake (SYN_SENT/SYN_RECV) is in progress. Returns
 * the remaining timeout (0 means it expired). */
/* In net/ipv4/af_inet.c */
static long inet_wait_for_connect(struct sock *sk, long timeo)
{
DECLARE_WAITQUEUE(wait, current);
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(sk->sleep, &wait);
/* Basic assumption: if someone sets sk->err, he _must_
 * change state of the socket from TCP_SYN_*.
 * Connect() does not allow to get error notifications
 * without closing the socket.
 */
while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 release_sock(sk);
 timeo = schedule_timeout(timeo); /* sleep until woken or the timeout expires */
 lock_sock(sk);
 if (signal_pending(current) || !timeo)
  break;
 set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sleep, &wait);
return timeo;
}