Establishing a TCP Connection (Part 2)
Passive Open
SYN cookies
The TCP implementation sets aside a fairly large memory area, the connection request queue, to store connection request blocks. As SYN requests keep arriving and the number of pending requests reaches the queue limit, the system starts dropping new SYN requests. The SYN cookies technique lets the server continue handling new SYN requests even when this half-open connection queue is full.
When the half-open queue is full, SYN cookies does not drop the SYN request; instead, it encodes the half-open state cryptographically. In the normal TCP implementation, when the server receives a client's SYN it replies with a SYN+ACK, and the client then sends back an ACK to confirm. Ordinarily the server's initial sequence number is a random value the server computes according to certain rules. With SYN cookies, however, the server's initial sequence number is derived by hashing the client IP address, client port, server IP address, server port, the client's initial sequence number, and a few other secret values; the result is called the cookie. When a SYN flood fills the request queue, the server does not reject new SYN requests: it replies with a SYN+ACK whose initial sequence number is the cookie. If the client's ACK later arrives, the server takes the acknowledgment number minus 1 and compares it with the hash recomputed from the same elements; if they match, the three-way handshake completes directly. Note that in this case the server does not consult the connection request queue at all.
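To make the mechanism concrete, below is a minimal user-space sketch of cookie generation and validation. The mixing function, the secret values, and all helper names here are invented for illustration; the kernel's real implementation lives in net/ipv4/syncookies.c and additionally encodes the client's MSS and a coarse timestamp into the cookie.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-boot secrets; the kernel keeps two and rotates them. */
static const uint32_t secret[2] = { 0x5f3759dfu, 0x9e3779b9u };

/* Toy 32-bit mixer standing in for the kernel's cryptographic hash. */
static uint32_t mix(uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t h = a * 0x9e3779b1u;
    h ^= b + 0x85ebca6bu + (h << 6) + (h >> 2);
    h ^= c + 0xc2b2ae35u + (h << 6) + (h >> 2);
    return h;
}

/* Server ISN (the cookie): a hash over the 4-tuple, the client's ISN
 * and the secrets, so no per-connection state has to be stored. */
static uint32_t cookie_make(uint32_t saddr, uint32_t daddr,
                            uint16_t sport, uint16_t dport,
                            uint32_t client_isn)
{
    return mix(mix(saddr, daddr, secret[0]),
               ((uint32_t)sport << 16) | dport, client_isn) + secret[1];
}

/* On the final ACK: ack_seq - 1 must equal the cookie we would have sent. */
static int cookie_check(uint32_t saddr, uint32_t daddr,
                        uint16_t sport, uint16_t dport,
                        uint32_t client_isn, uint32_t ack_seq)
{
    return ack_seq - 1 == cookie_make(saddr, daddr, sport, dport, client_isn);
}

int main(void)
{
    uint32_t c = cookie_make(0x0a000001, 0x0a000002, 12345, 80, 1000);

    /* The client acknowledges cookie + 1; validation just recomputes. */
    printf("valid: %d\n", cookie_check(0x0a000001, 0x0a000002,
                                       12345, 80, 1000, c + 1));
    return 0;
}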
SYN cookies are enabled by issuing the following command (for example, from a boot script):
echo 1 > /proc/sys/net/ipv4/tcp_syncookies
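The proc write above only affects the running system and is lost on reboot. Assuming a standard sysctl setup, the equivalent one-shot command and the persistent configuration are:

sysctl -w net.ipv4.tcp_syncookies=1

# persistent: add this line to /etc/sysctl.conf, then run `sysctl -p`
net.ipv4.tcp_syncookies = 1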
First handshake: receiving the SYN segment
Every segment received for a transport control block is handled by tcp_v4_do_rcv(), which dispatches to different handlers according to the socket state:
/* The socket must have it's spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
    /*
     * We really want to reject the packet as early as possible
     * if:
     *  o We're expecting an MD5'd packet and this is no MD5 tcp option
     *  o There is an MD5 option and we're not expecting one
     */
    if (tcp_v4_inbound_md5_hash(sk, skb))
        goto discard;
#endif

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        TCP_CHECK_TIMER(sk);
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        TCP_CHECK_TIMER(sk);
        return 0;
    }

    if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
        if (!nsk)
            goto discard;

        if (nsk != sk) {
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    }

    TCP_CHECK_TIMER(sk);
    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    TCP_CHECK_TIMER(sk);
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
Second handshake: sending the SYN+ACK segment

tcp_v4_send_synack() builds the SYN+ACK segment with which the server answers the client's connection-request SYN, encapsulates it in an IP datagram, and sends it to the client.
/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                                struct dst_entry *dst)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    int err = -1;
    struct sk_buff *skb;

    /* First, grab a route. */
    if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
        return -1;

    skb = tcp_make_synack(sk, dst, req);

    if (skb) {
        struct tcphdr *th = tcp_hdr(skb);

        th->check = tcp_v4_check(skb->len,
                                 ireq->loc_addr,
                                 ireq->rmt_addr,
                                 csum_partial(th, skb->len, skb->csum));

        err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                    ireq->rmt_addr,
                                    ireq->opt);
        err = net_xmit_eval(err);
    }

    dst_release(dst);
    return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
    return __tcp_v4_send_synack(sk, req, NULL);
}
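Note that th->check above is computed over the TCP segment plus an IPv4 pseudo-header (source address, destination address, protocol, TCP length). For reference, here is a plain user-space version of that checksum; the function names and byte-order conventions are mine, not the kernel's:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* RFC 1071 one's-complement sum over a buffer, accumulated into `sum`. */
static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
{
    while (len > 1) {
        sum += (uint32_t)p[0] << 8 | p[1];
        p += 2;
        len -= 2;
    }
    if (len)                        /* odd trailing byte */
        sum += (uint32_t)p[0] << 8;
    return sum;
}

/* Checksum of a TCP segment plus the IPv4 pseudo-header (source address,
 * destination address, protocol, TCP length). Addresses are host-order
 * here, and the segment's own checksum field must be zero on input. */
static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
                             const uint8_t *seg, size_t len)
{
    uint32_t sum = 0;

    sum += saddr >> 16;
    sum += saddr & 0xffff;
    sum += daddr >> 16;
    sum += daddr & 0xffff;
    sum += 6;                       /* IPPROTO_TCP */
    sum += (uint32_t)len;           /* TCP header + payload length */
    sum = csum_add(sum, seg, len);

    while (sum >> 16)               /* fold the carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

int main(void)
{
    uint8_t seg[20] = { 0 };        /* a bare TCP header, checksum zeroed */

    printf("checksum: 0x%04x\n",
           tcp_checksum(0x0a000001, 0x0a000002, seg, sizeof(seg)));
    return 0;
}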
Third handshake: receiving the ACK segment

After receiving the SYN segment, the server creates a connection request block for the connection being established and sends a SYN+ACK segment back to the client; it then starts the connection-establishment timer and waits for the client's final ACK of the three-way handshake.
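For completeness, this is the user-space side that exercises the passive-open path described above; the port number and backlog are arbitrary example values:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    int lfd, cfd;

    /* The listening socket sits in TCP_LISTEN; incoming SYNs for it
     * are dispatched by tcp_v4_do_rcv() via tcp_v4_hnd_req(). */
    lfd = socket(AF_INET, SOCK_STREAM, 0);
    if (lfd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);          /* arbitrary example port */

    if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(lfd, 128) < 0) {           /* 128 bounds the accept queue */
        perror("bind/listen");
        return 1;
    }

    /* accept() only dequeues connections whose three-way handshake
     * the kernel has already completed. */
    cfd = accept(lfd, NULL, NULL);
    if (cfd >= 0)
        close(cfd);
    close(lfd);
    return 0;
}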
Active Open

Implementation of the connect system call
inet_stream_connect() is the socket-layer implementation of the connect system call. It first validates the given address family, then checks the socket state; if the state is SS_UNCONNECTED it calls the transport-layer interface, which for TCP is tcp_v4_connect(), and finally waits for the connection to complete or fail.
/*
 * Connect to a remote host. There is regrettably still a little
 * TCP 'magic' in here.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
    struct sock *sk = sock->sk;
    int err;
    long timeo;

    lock_sock(sk);

    if (uaddr->sa_family == AF_UNSPEC) {
        err = sk->sk_prot->disconnect(sk, flags);
        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
        goto out;
    }

    switch (sock->state) {
    default:
        err = -EINVAL;
        goto out;
    case SS_CONNECTED:
        err = -EISCONN;
        goto out;
    case SS_CONNECTING:
        err = -EALREADY;
        /* Fall out of switch with err, set for this state */
        break;
    case SS_UNCONNECTED:
        err = -EISCONN;
        if (sk->sk_state != TCP_CLOSE)
            goto out;

        err = sk->sk_prot->connect(sk, uaddr, addr_len);
        if (err < 0)
            goto out;

        sock->state = SS_CONNECTING;

        /* Just entered SS_CONNECTING state; the only
         * difference is that return value in non-blocking
         * case is EINPROGRESS, rather than EALREADY.
         */
        err = -EINPROGRESS;
        break;
    }

    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        /* Error code is set above */
        if (!timeo || !inet_wait_for_connect(sk, timeo))
            goto out;

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
    }

    /* Connection was closed by RST, timeout, ICMP error
     * or another process disconnected us.
     */
    if (sk->sk_state == TCP_CLOSE)
        goto sock_error;

    /* sk->sk_err may be not zero now, if RECVERR was ordered by user
     * and error was received after socket entered established state.
     * Hence, it is handled normally after connect() return successfully.
     */

    sock->state = SS_CONNECTED;
    err = 0;
out:
    release_sock(sk);
    return err;

sock_error:
    err = sock_error(sk) ? : -ECONNABORTED;
    sock->state = SS_UNCONNECTED;
    if (sk->sk_prot->disconnect(sk, flags))
        sock->state = SS_DISCONNECTING;
    goto out;
}

After the call into the transport-layer interface: establishing a connection takes a three-way handshake, but the connect interface only covers sending the SYN segment; the remaining two handshake steps are completed by the protocol stack itself. Once the SYN segment has been sent successfully, all that is left is to wait for the third handshake to finish.
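Seen from user space, the EINPROGRESS path above is exactly what a non-blocking connect observes. A minimal sketch (the address and port are arbitrary example values):

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr;
    struct pollfd pfd;
    int fd, err;
    socklen_t len = sizeof(err);

    fd = socket(AF_INET, SOCK_STREAM, 0);
    fcntl(fd, F_SETFL, O_NONBLOCK);

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(8080);            /* arbitrary example */
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    /* The SYN goes out here; a non-blocking socket returns
     * -1/EINPROGRESS instead of sleeping in inet_wait_for_connect(). */
    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 &&
        errno != EINPROGRESS) {
        perror("connect");
        return 1;
    }

    /* Writability signals that the handshake finished (or failed). */
    pfd.fd = fd;
    pfd.events = POLLOUT;
    poll(&pfd, 1, 5000);

    /* SO_ERROR distinguishes success from e.g. ECONNREFUSED. */
    getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
    printf("connect result: %s\n", err ? strerror(err) : "established");
    close(fd);
    return 0;
}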
First handshake: sending the SYN segment
The client's transport control block is initialized and the SYN segment is sent; both are done by tcp_v4_connect().
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct rtable *rt;
    __be32 daddr, nexthop;
    int tmp;
    int err;

    if (addr_len < sizeof(struct sockaddr_in))
        return -EINVAL;

    if (usin->sin_family != AF_INET)
        return -EAFNOSUPPORT;

    nexthop = daddr = usin->sin_addr.s_addr;
    if (inet->opt && inet->opt->srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet->opt->faddr;
    }

    tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                           RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                           IPPROTO_TCP,
                           inet->sport, usin->sin_port, sk, 1);
    if (tmp < 0) {
        if (tmp == -ENETUNREACH)
            IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return tmp;
    }

    if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
        ip_rt_put(rt);
        return -ENETUNREACH;
    }

    if (!inet->opt || !inet->opt->srr)
        daddr = rt->rt_dst;

    if (!inet->saddr)
        inet->saddr = rt->rt_src;
    inet->rcv_saddr = inet->saddr;

    if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        tp->write_seq = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
        struct inet_peer *peer = rt_get_peer(rt);
        /*
         * VJ's idea. We save last timestamp seen from
         * the destination in peer table, when entering state
         * TIME-WAIT
         * and initialize rx_opt.ts_recent from it,
         * when trying new connection.
         */
        if (peer != NULL &&
            peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
            tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
            tp->rx_opt.ts_recent = peer->tcp_ts;
        }
    }

    inet->dport = usin->sin_port;
    inet->daddr = daddr;

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet->opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

    tp->rx_opt.mss_clamp = 536;

    /* Socket identity is still unknown (sport may be zero).
     * However we set state to SYN-SENT and not releasing socket
     * lock select source port, enter ourselves into the hash tables and
     * complete initialization after this.
     */
    tcp_set_state(sk, TCP_SYN_SENT);
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    err = ip_route_newports(&rt, IPPROTO_TCP,
                            inet->sport, inet->dport, sk);
    if (err)
        goto failure;

    /* OK, now commit destination to socket. */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->u.dst);

    if (!tp->write_seq)
        tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                   inet->daddr,
                                                   inet->sport,
                                                   usin->sin_port);

    inet->id = tp->write_seq ^ jiffies;

    err = tcp_connect(sk);
    rt = NULL;
    if (err)
        goto failure;

    return 0;

failure:
    /*
     * This unhashes the socket and releases the local port,
     * if necessary.
     */
    tcp_set_state(sk, TCP_CLOSE);
    ip_rt_put(rt);
    sk->sk_route_caps = 0;
    inet->dport = 0;
    return err;
}
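The initial send sequence tp->write_seq above comes from secure_tcp_sequence_number(), which follows the RFC 1948 scheme (later standardized as RFC 6528): a keyed hash of the connection 4-tuple plus a clock ticking every 4 microseconds, so ISNs are hard to predict yet still increase monotonically for a given address/port pair. A schematic user-space sketch, with a toy mixer and placeholder secret standing in for the kernel's cryptographic hash:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static const uint32_t secret_key = 0x12345678u;  /* per-boot secret (placeholder) */

/* Toy mixer standing in for the kernel's cryptographic hash. */
static uint32_t f(uint32_t a, uint32_t b, uint32_t k)
{
    uint32_t h = (a ^ k) * 0x9e3779b1u;
    return (h ^ b) * 0x85ebca6bu;
}

/* ISN = M + F(4-tuple, secret), where M ticks every 4 microseconds. */
static uint32_t isn(uint32_t saddr, uint32_t daddr,
                    uint16_t sport, uint16_t dport)
{
    struct timespec ts;
    uint32_t m;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    m = (uint32_t)((uint64_t)ts.tv_sec * 250000u + ts.tv_nsec / 4000);

    return m + f(saddr ^ daddr, ((uint32_t)sport << 16) | dport, secret_key);
}

int main(void)
{
    printf("isn: %u\n", isn(0x0a000001, 0x0a000002, 12345, 80));
    return 0;
}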
Second handshake: receiving the SYN+ACK segment

A transport control block in the SYN_SENT state is handled by tcp_rcv_state_process(), which for this state delegates to tcp_rcv_synsent_state_process().
/*
 * This function implements the receiving procedure of RFC 793 for
 * all states except ESTABLISHED and TIME_WAIT.
 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 * address independent.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                          struct tcphdr *th, unsigned len)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    int queued = 0;
    int res;

    tp->rx_opt.saw_tstamp = 0;

    switch (sk->sk_state) {
    case TCP_CLOSE:
        goto discard;

    case TCP_LISTEN:
        if (th->ack)
            return 1;

        if (th->rst)
            goto discard;

        if (th->syn) {
            if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                return 1;

            /* Now we have several options: In theory there is
             * nothing else in the frame. KA9Q has an option to
             * send data with the syn, BSD accepts data with the
             * syn up to the [to be] advertised window and
             * Solaris 2.1 gives you a protocol error. For now
             * we just ignore it, that fits the spec precisely
             * and avoids incompatibilities. It would be nice in
             * future to drop through and process the data.
             *
             * Now that TTCP is starting to be used we ought to
             * queue this data.
             * But, this leaves one open to an easy denial of
             * service attack, and SYN cookies can't defend
             * against this problem. So, we drop the data
             * in the interest of security over speed unless
             * it's still in use.
             */
            kfree_skb(skb);
            return 0;
        }
        goto discard;

    case TCP_SYN_SENT:
        queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
        if (queued >= 0)
            return queued;

        /* Do step6 onward by hand. */
        tcp_urg(sk, skb, th);
        __kfree_skb(skb);
        tcp_data_snd_check(sk);
        return 0;
    }

    res = tcp_validate_incoming(sk, skb, th, 0);
    if (res <= 0)
        return -res;

    /* step 5: check the ACK field */
    if (th->ack) {
        int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;

        switch (sk->sk_state) {
        case TCP_SYN_RECV:
            if (acceptable) {
                tp->copied_seq = tp->rcv_nxt;
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);
                sk->sk_state_change(sk);

                /* Note, that this wakeup is only for marginal
                 * crossed SYN case. Passively open sockets
                 * are not waked up, because sk->sk_sleep ==
                 * NULL and sk->sk_socket == NULL.
                 */
                if (sk->sk_socket)
                    sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

                tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
                tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

                /* tcp_ack considers this ACK as duplicate
                 * and does not calculate rtt.
                 * Force it here.
                 */
                tcp_ack_update_rtt(sk, 0, 0);

                if (tp->rx_opt.tstamp_ok)
                    tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

                /* Make sure socket is routed, for
                 * correct metrics.
                 */
                icsk->icsk_af_ops->rebuild_header(sk);

                tcp_init_metrics(sk);

                tcp_init_congestion_control(sk);

                /* Prevent spurious tcp_cwnd_restart() on
                 * first data packet.
                 */
                tp->lsndtime = tcp_time_stamp;

                tcp_mtup_init(sk);
                tcp_initialize_rcv_mss(sk);
                tcp_init_buffer_space(sk);
                tcp_fast_path_on(tp);
            } else {
                return 1;
            }
            break;

        case TCP_FIN_WAIT1:
            if (tp->snd_una == tp->write_seq) {
                tcp_set_state(sk, TCP_FIN_WAIT2);
                sk->sk_shutdown |= SEND_SHUTDOWN;
                dst_confirm(sk->sk_dst_cache);

                if (!sock_flag(sk, SOCK_DEAD))
                    /* Wake up lingering close() */
                    sk->sk_state_change(sk);
                else {
                    int tmo;

                    if (tp->linger2 < 0 ||
                        (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                         after(TCP_SKB_CB(skb)->end_seq - th->fin,
                               tp->rcv_nxt))) {
                        tcp_done(sk);
                        NET_INC_STATS_BH(sock_net(sk),
                                         LINUX_MIB_TCPABORTONDATA);
                        return 1;
                    }

                    tmo = tcp_fin_time(sk);
                    if (tmo > TCP_TIMEWAIT_LEN) {
                        inet_csk_reset_keepalive_timer(sk,
                                tmo - TCP_TIMEWAIT_LEN);
                    } else if (th->fin || sock_owned_by_user(sk)) {
                        /* Bad case. We could lose such FIN otherwise.
                         * It is not a big problem, but it looks confusing
                         * and not so rare event. We still can lose it now,
                         * if it spins in bh_lock_sock(), but it is really
                         * marginal case.
                         */
                        inet_csk_reset_keepalive_timer(sk, tmo);
                    } else {
                        tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                        goto discard;
                    }
                }
            }
            break;

        case TCP_CLOSING:
            if (tp->snd_una == tp->write_seq) {
                tcp_time_wait(sk, TCP_TIME_WAIT, 0);
                goto discard;
            }
            break;

        case TCP_LAST_ACK:
            if (tp->snd_una == tp->write_seq) {
                tcp_update_metrics(sk);
                tcp_done(sk);
                goto discard;
            }
            break;
        }
    } else
        goto discard;

    /* step 6: check the URG bit */
    tcp_urg(sk, skb, th);

    /* step 7: process the segment text */
    switch (sk->sk_state) {
    case TCP_CLOSE_WAIT:
    case TCP_CLOSING:
    case TCP_LAST_ACK:
        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
            break;
    case TCP_FIN_WAIT1:
    case TCP_FIN_WAIT2:
        /* RFC 793 says to queue data in these states,
         * RFC 1122 says we MUST send a reset.
         * BSD 4.4 also does reset.
         */
        if (sk->sk_shutdown & RCV_SHUTDOWN) {
            if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                tcp_reset(sk);
                return 1;
            }
        }
        /* Fall through */
    case TCP_ESTABLISHED:
        tcp_data_queue(sk, skb);
        queued = 1;
        break;
    }

    /* tcp_data could move socket to TIME-WAIT */
    if (sk->sk_state != TCP_CLOSE) {
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
    }

    if (!queued) {
discard:
        __kfree_skb(skb);
    }
    return 0;
}
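For orientation, the handshake-related transitions that tcp_rcv_state_process() and its helpers implement can be boiled down to the following skeleton. This is a didactic sketch of the RFC 793 state machine, not kernel code:

#include <stdio.h>

enum tcp_state { CLOSED, LISTEN, SYN_SENT, SYN_RECV, ESTABLISHED };

struct seg { int syn, ack; };       /* flags of the incoming segment */

/* Handshake-only slice of the RFC 793 state machine. */
static enum tcp_state step(enum tcp_state s, struct seg in)
{
    switch (s) {
    case LISTEN:
        if (in.syn && !in.ack)
            return SYN_RECV;        /* got SYN, reply with SYN+ACK */
        break;
    case SYN_SENT:
        if (in.syn && in.ack)
            return ESTABLISHED;     /* got SYN+ACK, reply with ACK */
        if (in.syn && !in.ack)
            return SYN_RECV;        /* simultaneous open */
        break;
    case SYN_RECV:
        if (in.ack && !in.syn)
            return ESTABLISHED;     /* final ACK of the handshake */
        break;
    default:
        break;
    }
    return s;                       /* everything else: no transition here */
}

int main(void)
{
    struct seg synack = { 1, 1 };

    printf("SYN_SENT + SYN/ACK -> %s\n",
           step(SYN_SENT, synack) == ESTABLISHED ? "ESTABLISHED" : "other");
    return 0;
}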
Third handshake: sending the ACK segment

tcp_send_ack() sends an ACK segment and updates the window at the same time.
/* This routine sends an ack and also updates the window. */
void tcp_send_ack(struct sock *sk)
{
    struct sk_buff *buff;

    /* If we have been reset, we may not send again. */
    if (sk->sk_state == TCP_CLOSE)
        return;

    /* We are not putting this on the write queue, so
     * tcp_transmit_skb() will set the ownership to this
     * sock.
     */
    buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
    if (buff == NULL) {
        inet_csk_schedule_ack(sk);
        inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                  TCP_DELACK_MAX, TCP_RTO_MAX);
        return;
    }

    /* Reserve space for headers and prepare control bits. */
    skb_reserve(buff, MAX_TCP_HEADER);
    tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);

    /* Send it off, this clears delayed acks for us. */
    TCP_SKB_CB(buff)->when = tcp_time_stamp;
    tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
}

An ACK segment is sent only if TCP is not in the CLOSE state.
An SKB is allocated for the ACK segment; if the allocation fails, the delayed-ACK timer is armed instead and the function returns.