Journey of a received ACK in response to our SYN-ACK as seen by a TCP server in IPv4 TCP/IP Linux 2.6.18+ ----------------------- Where does ethereal (& presumably tcpdump) hook into the new Linux kernel 2.4.16 using netfilter & iptables? Nowhere. They just call libpcap, and let *it* do the hooking. [:-)] Now, the next question would then be "where does libpcap hook into the new Linux kernel 2.4.16 using netfilter & iptables?" The answer to that question is "the same place it hooks into any other Linux 2.2[.x] or 2.4[.x] kernel - through a PF_PACKET socket." The next question would be "where do PF_PACKET sockets tap into the network data stream in a 2.4.16 system using netfilter & iptables?" I don't know the answer offhand, and don't have time to search for it (either in documentation or, as I fear would be required, the code); I'd suggest asking on, say, the linux-net mailing list, if nobody else on this list happens to know the answer. // allocated net/core/sock.c::sk_alloc() // receive net/core/sock.c::sk_receive_skb() include/net/sock.h::sock_put() // used net/core/sock.c::sock_alloc_send_skb() net/core/sock.c::sock_alloc_send_pskb() ------------------------ Where does Netfilter hook into the kernel for IPv4? 
net/ipv4/arp.c:954: return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); net/ipv4/ip_output.c:158: return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, net/ipv4/ip_output.c:364: return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, net/ipv4/ip_input.c:275: return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, net/ipv4/ip_input.c:434: return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, net/ipv4/igmp.c:357: return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, net/ipv4/igmp.c:692: return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, net/ipv4/ip_forward.c:106: return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, ------------------------- ------------------------- Socket changing state to TCP_ESTABLISHED //=include/net/tcp.h::tcp_set_state(sock *, int) { switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(TCP_MIB_CURRESTAB); break; /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. 
*/ sk->sk_state = state; } //=include/net/tcp.h::tcp_set_state() ------------------------- Socket promotion from SYN_RECEIVED to TCP_ESTABLISHED //=include/linux/net.h::struct proto_ops->accept //=net/ipv4/af_inet.c::const struct proto_ops inet_stream_ops->accept //=net/ipv4/af_inet.c::inet_accept(socket*, socket*, int) { struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); //=include/net/sock.h::proto->accept (sock, int, int*) //=net/ipv4/af_inet.c::struct inet_protosw inetsw_array[] = { .type = SOCK_STREAM, .prot = &tcp_prot, //=net/ipv4/tcp_ipv4.c::struct proto tcp_prot = { .accept = inet_csk_accept, //=net/ipv4/inet_connection_sock.c::inet_csk_accept(sock*, int, int*) { newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); out: release_sock(sk); return newsk; } //=net/ipv4/inet_connection_sock.c::inet_csk_accept() } //=net/ipv4/tcp_ipv4.c::struct proto tcp_prot } //=net/ipv4/af_inet.c::struct inet_protosw inetsw_array } //=net/ipv4/af_inet.c::inet_accept() ------------------------- // Socket in state SYN_RECEIVED // Receive packet //=net/core/dev.c::net_rx_action() { //=net/core/dev.c::netif_receive_skb() { //=net/ipv4/ip_input.c:ip_rcv() { //=net/ipv4/ip_input.c:ip_rcv_finish() { //=net/ipv4/ip_input.c:ip_local_deliver() { //=net/ipv4/ip_input.c:ip_local_deliver_finish() { ret = ipprot->handler(skb); //=net/ipv4/af_inet.c::struct net_protocol tcp_protocol->handler //=net/ipv4/tcp_ipv4.c::tcp_v4_rcv() { TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); if (!sock_owned_by_user(sk)) { { if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } } else sk_add_backlog(sk, skb); //=net/ipv4/tcp_ipv4.c::tcp_v4_do_rcv() { if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; // ... } // ******* TCP_ESTABLISHED is set at the end of the function, so skip down to avoid getting a headache! 
***** //=net/ipv4/tcp_ipv4.c::tcp_v4_hnd_req() { /* Find possible connection requests. */ struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); //=net/ipv4/tcp_minisocks.c::tcp_check_req() { /* * Process an incoming packet for SYN_RECV sockets represented * as a request_sock. */ /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. */ if (!(flg & TCP_FLAG_ACK)) return NULL; /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); //=net/ipv4/tcp_ipv4.c::struct inet_connection_sock_af_ops ipv4_specific->syn_recv_sock //=net/ipv4/tcp_ipv4.c::tcp_v4_syn_recv_sock() { /* * The three way handshake has completed - we got a valid synack - * now create the new socket. 
*/ if (sk_acceptq_is_full(sk)) goto exit_overflow; if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto exit; // **************************** PROMOTION to FULL SOCKET ************************ newsk = tcp_create_openreq_child(sk, req, skb); //=net/ipv4/tcp_minisocks.c::tcp_create_openreq_child() { tcp_set_ca_state(newsk, TCP_CA_Open); //=include/net/tcp.h::tcp_set_ca_state() { //=include/linux/tcp.h::enum tcp_ca_state { TCP_CA_Open = 0, #define TCPF_CA_Open (1<<TCP_CA_Open) /* [text lost in extraction] */ //=include/net/inet_connection_sock.h::struct inet_connection_sock->icsk_ca_ops //=include/net/tcp.h::struct tcp_congestion_ops /* * Interface for adding new TCP congestion control handlers */ /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); }//=include/net/tcp.h::struct tcp_congestion_ops }//=include/net/inet_connection_sock.h::struct inet_connection_sock if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); //=net/ipv4/tcp_bic.c::bictcp_state() { if (new_state == TCP_CA_Loss) bictcp_reset(inet_csk_ca(sk)); } //=net/ipv4/tcp_bic.c::bictcp_state() icsk->icsk_ca_state = ca_state; } //=include/net/tcp.h::tcp_set_ca_state() } //=net/ipv4/tcp_minisocks.c::tcp_create_openreq_child() if (!newsk) goto exit; newsk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(newsk, dst); __inet_hash(&tcp_hashinfo, newsk, 0); __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); exit: NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); dst_release(dst); return NULL; } //=net/ipv4/tcp_ipv4.c::tcp_v4_syn_recv_sock() if (child == NULL) goto listen_overflow; inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: if (!sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(sk, skb); inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } 
//=net/ipv4/tcp_minisocks.c::tcp_check_req() } //=net/ipv4/tcp_ipv4.c::tcp_v4_hnd_req() TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) //=net/ipv4/tcp_input.c::tcp_rcv_state_process(sock*, sk_buff*, tcphdr*, unsigned) { /* step 5: check the ACK field */ if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); switch(sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; mb(); // **************************** FINALLY TCP_ESTABLISHED ************************ tcp_set_state(sk, TCP_ESTABLISHED); // ***************************************************************************** sk->sk_state_change(sk); // ... } else { return 1; } break; } } //=net/ipv4/tcp_input.c::tcp_rcv_state_process() goto reset; TCP_CHECK_TIMER(sk); return 0; } //=net/ipv4/tcp_ipv4.c::tcp_v4_do_rcv() } //=net/ipv4/tcp_ipv4.c::tcp_v4_rcv() } //=net/ipv4/ip_input.c:ip_local_deliver_finish() return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); } //=net/ipv4/ip_input.c:ip_local_deliver() } //=net/ipv4/ip_input.c:ip_rcv_finish() return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); } //=net/ipv4/ip_input.c:ip_rcv() // Socket in state ESTABLISHED /* Notes See Apache directive for TCP_DEFER_ACCEPT http://httpd.apache.org/docs/trunk/mod/core.html#acceptfilter man 7 tcp: SOCKET OPTIONS To set or get a TCP socket option, call getsockopt(2) to read or setsockopt(2) to write TCP_DEFER_ACCEPT Allows a listener to be awakened only when data arrives on the socket. Takes an integer value (seconds), this can bound the maximum number of attempts TCP will make to complete the connection. This option should not be used in code intended to be portable. */