// 普通的 BSD 标准 socket 结构体 // socket_state: socket 状态, 连接?不连接? // type: socket type (%SOCK_STREAM, etc) // flags: socket flags (%SOCK_NOSPACE, etc) // ops: 专用协议的socket的操作 // file: 与socket 有关的指针列表 // sk: 负责协议相关结构体,这样就让这个这个结构体和协议分开。 // wq: 等待队列 struct socket { socket_state state; kmemcheck_bitfield_begin(type); short type; kmemcheck_bitfield_end(type); unsigned long flags; struct socket_wq __rcu *wq; struct file *file; struct sock *sk; const struct proto_ops *ops; };
// socket() 本质上是 glibc 中的函数,执行的实际上是 sys_socketcall() 系统调用。 // sys_socketcall() 几乎是所有的socket函数的入口, // 也就是 bind,connect 等函数都是需要asmlinkage long sys_socketcall(int call, unsigned long __user *args); 、、sys_socketcall() 作为入口,函数如下: // include/linux/syscalls.h asmlinkage long sys_socketcall(int call, unsigned long __user *args);
// net/socket.c SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; // 判断,然后运行相对应的函数 switch (call) { case SYS_SOCKET: // 这里就是 sys_socket(), err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = sys_listen(a0, a1); break; // ... ... default: err = -EINVAL; break; } return err; }
// include/linux/syscalls.h asmlinkage long sys_socket(int, int, int); // net/socket.c SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { int retval; struct socket *sock; int flags; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; // 这里创建了 socket 结构体 retval = sock_create(family, type, protocol, &sock); if (retval < 0) goto out; // 与文件系统进行关联 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); if (retval < 0) goto out_release; out: /* It may be already another descriptor 8) Not kernel problem. */ return retval; out_release: sock_release(sock); return retval; }
// include/linux/net.h int sock_create(int family, int type, int proto, struct socket **res); // net/socket.c int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); // include/linux/net.h int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); // net/socket.c int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ // 检查 协议族是否在范围呢 if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) // 检查类型 return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ // 检查用的是PF_INET 其实这个都是兼容的。 if (family == PF_INET && type == SOCK_PACKET) { static int warned; if (!warned) { warned = 1; pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); } family = PF_PACKET; } // 安全机制检查 err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ // ----> sock_alloc 接下面 sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; // ... ... return 0; // ... ... } EXPORT_SYMBOL(__sock_create);
sock_alloc()
函数解析,被上面的 __sock_create()
函数调用
// net/socket.c static struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; // 模式 inode->i_uid = current_fsuid(); // 获取当前的uid inode->i_gid = current_fsgid(); // 获取当前的gid inode->i_op = &sockfs_inode_ops; // 操作 this_cpu_add(sockets_in_use, 1); return sock; } // 申请一个 socket 结构体 ,名字为 sock // 申请一个新的节点和一个新的 socket 项目, 绑定他们两个并且初始化 // 如果申请inode 失败返回 NULL, 或者返回sock
// 接下来我们再看到 SOCKET_I(inode); // include/net/sock.h static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } // 然后我们发现,返回的是 inode 内的socket 结构体。 // 我们可以分析一个 container_of() 这个是怎么定义的。 // include/linux/kernel.h #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) // typeof 将 ptr 的指针临时保存起来为 __mptr // 然后用这个 __mptr 指针减去下面的 member 的便宜量。 // 得到的就是 type 这个结构体的头指针。 // offsetof include/linux/stddef.h #undef offsetof #ifdef __compiler_offsetof #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) #else #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) #endif // 反正这里有点难理解,最后得到的结果是 type 这个结构体的头指针。 // 所以说 SOCKET_I() 得到的是 struct socket_alloc 的头指针 // include/net/sock.h struct socket_alloc { struct socket socket; struct inode vfs_inode; };
- #### 回到 __sock_create() 继续分析
// net/socket.c --> __sock_create() #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif
如果在 make menuconfig 中选上 编译成模块的选项,则会运行上面这个部分。 里面先是检查对应的协议族的操作表是否已经安装,如果没有安装则使用 request_module 进行安装,现在都是在 TCP/IP协议下进行分析,所以 family 是 AF_INET , 也就是 2 , 所以实际检查的全局变量是 net_families[2], 这个全局变量是在系统初始化时由 net/ipv4/af_inet.c 文件进行安装,具体代码是:
// net/ipv4/af_inet.c static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc = -EINVAL; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); // 各个协议的注册 rc = proto_register(&tcp_prot, 1); if (rc) goto out; rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif /* * Add all the base protocols. */ // 各个协议的添加,添加不成功则报错 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); // 把这个关键性的链接表一个个注册上去
// ****************************************************** // inetsw_array 结构体数组数组, 这里面有包含每个的协议,比如说tcp_prot static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_REUSE, }, // ... ... } // tcp_prot ---> net/ipv4/tcp_ipv4.c struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, // 这是init 函数会在后面被调用 .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_DESTROY_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif #ifdef CONFIG_MEMCG_KMEM .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, #endif }; EXPORT_SYMBOL(tcp_prot); // ***********************************************************
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); // 各个协议模块的初始化 /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); tcp_v4_init(); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); /* Add UDP-Lite (RFC 3828) */ udplite4_register(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); /* * Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) pr_crit("%s: Cannot init ipv4 mibs\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); ip_tunnel_core_init(); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); goto out; } fs_initcall(inet_init);
- #### 很粗浅的看完协议那一部分之后我们回到 __sock_create()
// net/socket.c // 看到 这个回调函数的调用 err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put;
// 先看一个 inet_protosw 结构体 // include/net/protocol.h /* This is used to register socket interfaces for IP protocols. */ struct inet_protosw { struct list_head list; /* These two fields form the lookup key. */ unsigned short type; /* This is the 2nd argument to socket(2). */ unsigned short protocol; /* This is the L4 protocol number. */ struct proto *prot; const struct proto_ops *ops; unsigned char flags; /* See INET_PROTOSW_* below. */ }; // 上面的 create 函数对应的是 net/ipv4/af_inet.c 里面的 inet_create 函数 // net/ipv4/af_inet.c static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; // 检查协议是否在范围之内 if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; // 设置状态为未连接 sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ // 遍历寻找请求的协议类型 lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); // 遍历 inetsw[] 数组,其实就是次数而已 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; // 检查对应的协议,然后再选择合适的协议 /* Check the non-wild match. */ // 找到对应的协议,如果找到对应的协议,但是protocol 不是 IPPRORO_IP,则直接退出 if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } // 如果没有对应的协议则返回错误码 err = -EPROTONOSUPPORT; } // 如果没有加载模块的保护措施 if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; // 检查通用性,只有root 权限然后使用原始套接字 if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; // 对socket 的操作集合进行了互联。 sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOBUFS; /* 此处调用sk_alloc分配一个struct sock,该结构体庞大,其作用是网络层对socket的表示,意思就是IP协议下有很多东西比如IP地址,网卡接口,端口等等信息需要再socket层中有所体现从而使编程者方便使用,然后就利用指针等形式把内容进行一定程度上的映射。sk_alloc首先对sock->proto和sock_creator进行设置,设置成当前协议对应的proto调用sk_prot_alloc()根据是否提供了slab缓存而判断是使用slab缓存还是通用缓存。只要分配成功,则调用sock_lock_init()对缓存进行初始化,主要是对sock锁、等待队列以及进程数据结构中的网络空间结构进行分配。初始化完了后调用sock_net_set()函数对网络空间结构进行记录,然后最后增加一个net计数器。至此回到inet_create,判断是否成功分配 */ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; // 返回一个 struct inet_sock 的指针给 inet inet = inet_sk(sk); // 判断是不是面向连通 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; inet->nodefrag = 0; // 判断是不是原始套接字,如果是,新建IP头部。 if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } // 判断是否采用路径 MTU 发现算法 if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; inet->inet_id = 0; // 进一步初始化结构体 sk (struct sock) // sock_init_data: 初始化接收,发送,错误信息队列,三个队列都是双向链表,属于sk_buff_head 结构体,其中会把 sk_buff 结构体串联在一起,初始化数据包发送定时器,变量,(主要是函数指针) sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_all = 1; inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; sk_refcnt_debug_inc(sk); if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ sk->sk_prot->hash(sk); } // 这里,就是调用了协议里面的 init 函数 tcp_v4_init_sock if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) sk_common_release(sk); } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; }
static int tcp_v4_init_sock(struct sock *sk) { // 强制转换类型 struct inet_connection_sock *icsk = inet_csk(sk); // 调用这个进行初始化 ,里面就时关于tcp 的一些初始化了,到此为止 tcp_init_sock(sk); // ipv4 专用操作 icsk->icsk_af_ops = &ipv4_specific; #ifdef CONFIG_TCP_MD5SIG tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; #endif return 0; }
到此, sock_create 分析完毕
最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
// net/socket.c // 刚才分析完毕 retval = sock_create(family, type, protocol, &sock); if (retval < 0) goto out; // socket 映射到文件系统 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); if (retval < 0) goto out_release;
// net/socket.c static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) return fd; // 申请一个 sock file 节点 newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } // 这里所展现的意思是,把socket当成一个文件节点进行操作,open, read,write ,ioctl 等