Initial commit from project. lib dependencies need checking
[packeteer.git] / docs / Journey of a received TCP packet.txt
1 Journey of a received ACK in response to our SYN-ACK as seen by a TCP server in IPv4 TCP/IP
2
3 Linux 2.6.18+
4
5 -----------------------
6
7 Where does ethereal (& presumably tcpdump) hook into the new Linux kernel 2.4.16 using netfilter & iptables? 
8
9         Nowhere. They just call libpcap, and let *it* do the hooking. [:-)]
10
11 Now, the next question would then be "where does libpcap hook into the
12 new Linux kernel 2.4.16 using netfilter & iptables?"
13
14         The answer to that question is "the same place it hooks into any other
15         Linux 2.2[.x] or 2.4[.x] kernel - through a PF_PACKET socket."
16
17 The next question would be "where do PF_PACKETS tap into the network
18 data stream in a 2.4.16 system using netfilter & iptables?"
19
20         I don't know the answer offhand, and don't have time to search for it
21         (either in documentation or, as I fear would be required, the code); I'd
22         suggest asking on, say, the linux-net mailing list, if nobody else on
23         this list happens to know the answer.
24
25 // allocated
26 net/core/sock.c::sk_alloc()
27
28 // receive
29 net/core/sock.c::sk_receive_skb()
30  include/net/sock.h::sock_put()
31
32 // used
33 net/core/sock.c::sock_alloc_send_skb()
34  net/core/sock.c::sock_alloc_send_pskb()
35
36 ------------------------
37
38 Where does Netfilter hook into the kernel for IPv4?
39
40 net/ipv4/arp.c:954:     return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
41 net/ipv4/ip_output.c:158:       return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
42 net/ipv4/ip_output.c:364:       return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
43 net/ipv4/ip_input.c:275:        return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
44 net/ipv4/ip_input.c:434:        return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
45 net/ipv4/igmp.c:357:    return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
46 net/ipv4/igmp.c:692:    return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
47 net/ipv4/ip_forward.c:106:      return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
48
49
50 -------------------------
51
52
53
54
55 -------------------------
56 Socket changing state to TCP_ESTABLISHED
57
58 //=include/net/tcp.h::tcp_set_state(sock *, int) {
59
60  switch (state) {
61   case TCP_ESTABLISHED:
62    if (oldstate != TCP_ESTABLISHED)
63     TCP_INC_STATS(TCP_MIB_CURRESTAB);
64    break;
65
66  /* Change state AFTER socket is unhashed to avoid closed
67   * socket sitting in hash tables.
68   */
69  sk->sk_state = state;
70
71 } //=include/net/tcp.h::tcp_set_state()
72  
73 -------------------------
74 Socket promotion from SYN_RECEIVED to TCP_ESTABLISHED
75
76 //=include/linux/net.h::struct proto_ops->accept
77 //=net/ipv4/af_inet.c::const struct proto_ops inet_stream_ops->accept
78 //=net/ipv4/af_inet.c::inet_accept(socket*, socket*, int) {
79
80   struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
81
82   //=include/net/sock.h::proto->accept (sock, int, int*)
83   //=net/ipv4/af_inet.c::struct inet_protosw inetsw_array[] = {
84   
85    .type =       SOCK_STREAM,
86    .prot =       &tcp_prot,
87    //=net/ipv4/tcp_ipv4.c::struct proto tcp_prot = {
88     
89     .accept                     = inet_csk_accept,
90     //=net/ipv4/inet_connection_sock.c::inet_csk_accept(sock*, int, int*) {
91
92      newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
93      BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
94     out:
95      release_sock(sk);
96      return newsk;    
97     
98     } //=net/ipv4/inet_connection_sock.c::inet_csk_accept()
99   
100    } //=net/ipv4/tcp_ipv4.c::struct proto tcp_prot
101   
102   } //=net/ipv4/af_inet.c::struct inet_protosw inetsw_array
103    
104 } //=net/ipv4/af_inet.c::inet_accept()
105 -------------------------
106
107 // Socket in state SYN_RECEIVED
108
109 // Receive packet
110
111 //=net/core/dev.c::net_rx_action() {
112 //=net/core/dev.c::netif_receive_skb() {
113
114 //=net/ipv4/ip_input.c:ip_rcv() {
115  
116  //=net/ipv4/ip_input.c:ip_rcv_finish() {
117
118   //=net/ipv4/ip_input.c:ip_local_deliver() {
119
120    //=net/ipv4/ip_input.c:ip_local_deliver_finish() {
121     ret = ipprot->handler(skb);
122     //=net/ipv4/af_inet.c::struct net_protocol tcp_protocol->handler
123
124      //=net/ipv4/tcp_ipv4.c::tcp_v4_rcv() {
125
126           TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4);
127
128           if (!sock_owned_by_user(sk)) {
129            {
130             if (!tcp_prequeue(sk, skb))
131                  ret = tcp_v4_do_rcv(sk, skb);
132            }
133           } else
134            sk_add_backlog(sk, skb);
135
136       //=net/ipv4/tcp_ipv4.c::tcp_v4_do_rcv() {
137
138            if (sk->sk_state == TCP_LISTEN) {
139                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
140                 if (!nsk)
141                  goto discard;
142                 // ...
143        }
144
145           // ******* TCP_ESTABLISHED is set at the end of the function, so skip down to avoid getting a headache! *****
146
147        //=net/ipv4/tcp_ipv4.c::tcp_v4_hnd_req() {
148
149         /* Find possible connection requests. */
150          struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,iph->saddr, iph->daddr);
151          if (req)
152               return tcp_check_req(sk, skb, req, prev);
153
154           //=net/ipv4/tcp_minisocks.c::tcp_check_req() {
155            /* 
156             *   Process an incoming packet for SYN_RECV sockets represented
157             *   as a request_sock.
158             */
159
160            /* ACK sequence verified above, just make sure ACK is
161             * set.  If ACK not set, just silently drop the packet.
162             */
163            if (!(flg & TCP_FLAG_ACK))
164             return NULL;
165
166            /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
167            if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
168                TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
169             inet_rsk(req)->acked = 1;
170             return NULL;
171            }
172
173                    /* OK, ACK is valid, create big socket and
174                     * feed this segment to it. It will repeat all
175                     * the tests. THIS SEGMENT MUST MOVE SOCKET TO
176                     * ESTABLISHED STATE. If it will be dropped after
177                     * socket is created, wait for troubles.
178                     */
179                    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
180                    
181            //=net/ipv4/tcp_ipv4.c::struct inet_connection_sock_af_ops ipv4_specific->syn_recv_sock
182            //=net/ipv4/tcp_ipv4.c::tcp_v4_syn_recv_sock() {
183                         /*
184                          * The three way handshake has completed - we got a valid synack -
185                          * now create the new socket.
186                          */
187
188                         if (sk_acceptq_is_full(sk))
189                                 goto exit_overflow;
190                 
191                         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
192                                 goto exit;
193                 
194                 // **************************** PROMOTION to FULL SOCKET ************************
195                         newsk = tcp_create_openreq_child(sk, req, skb);
196                         
197                         //=net/ipv4/tcp_minisocks.c::tcp_create_openreq_child() {
198                         
199                          tcp_set_ca_state(newsk, TCP_CA_Open);                   
200                          //=include/net/tcp.h::tcp_set_ca_state() {
201                           
202                           //=include/linux/tcp.h::enum tcp_ca_state {
203                            TCP_CA_Open = 0,
204                            #define TCPF_CA_Open (1<<TCP_CA_Open)
205                            TCP_CA_Disorder = 1,
206                            #define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
207                            TCP_CA_CWR = 2,
208                            #define TCPF_CA_CWR  (1<<TCP_CA_CWR)
209                            TCP_CA_Recovery = 3,
210                            #define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
211                            TCP_CA_Loss = 4
212                            #define TCPF_CA_Loss (1<<TCP_CA_Loss)
213                           
214                           }; // =include/linux/tcp.h::enum tcp_ca_state
215                          
216                           struct inet_connection_sock *icsk = inet_csk(sk);
217                           
218                           //=include/net/inet_connection_sock.h::struct inet_connection_sock->icsk_ca_ops
219                            
220                            //=include/net/tcp.h::struct tcp_congestion_ops
221                             /*
222                              * Interface for adding new TCP congestion control handlers
223                                  */
224                             
225                             /* call before changing ca_state (optional) */
226                 void (*set_state)(struct sock *sk, u8 new_state);
227
228                            
229                            }//=include/net/tcp.h::struct tcp_congestion_ops
230                           
231                           }//=include/net/inet_connection_sock.h::struct inet_connection_sock
232                           
233                   if (icsk->icsk_ca_ops->set_state)
234                icsk->icsk_ca_ops->set_state(sk, ca_state);
235                       
236                        //=net/ipv4/tcp_bic.c::bictcp_state() {
237                        
238                         if (new_state == TCP_CA_Loss)
239                                  bictcp_reset(inet_csk_ca(sk));
240                        
241                        } //=net/ipv4/tcp_bic.c::bictcp_state()
242                        
243                       icsk->icsk_ca_state = ca_state;
244          
245                          } //=include/net/tcp.h::tcp_set_ca_state()
246                          
247                         } //=net/ipv4/tcp_minisocks.c::tcp_create_openreq_child()
248                         
249                         if (!newsk)
250                                 goto exit;
251                 
252                         newsk->sk_gso_type = SKB_GSO_TCPV4;
253                         sk_setup_caps(newsk, dst);
254
255                         __inet_hash(&tcp_hashinfo, newsk, 0);
256                         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
257                 
258                         return newsk;
259                 
260                    exit_overflow:
261                         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
262                    exit:
263                         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
264                         dst_release(dst);
265                         return NULL;
266
267                    } //=net/ipv4/tcp_ipv4.c::tcp_v4_syn_recv_sock()
268
269                    if (child == NULL)
270                         goto listen_overflow;
271
272                    inet_csk_reqsk_queue_unlink(sk, req, prev);
273                    inet_csk_reqsk_queue_removed(sk, req);
274
275                    inet_csk_reqsk_queue_add(sk, req, child);
276                    return child;            
277
278                   listen_overflow:
279                    if (!sysctl_tcp_abort_on_overflow) {
280                         inet_rsk(req)->acked = 1;
281                         return NULL;
282                    }
283         
284                   embryonic_reset:
285                    NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
286                    if (!(flg & TCP_FLAG_RST))
287                         req->rsk_ops->send_reset(sk, skb);
288         
289                    inet_csk_reqsk_queue_drop(sk, req, prev);
290                    return NULL;
291
292           } //=net/ipv4/tcp_minisocks.c::tcp_check_req()
293        } //=net/ipv4/tcp_ipv4.c::tcp_v4_hnd_req()
294
295            TCP_CHECK_TIMER(sk);
296            if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
297
298        //=net/ipv4/tcp_input.c::tcp_rcv_state_process(sock*, sk_buff*, tcphdr*, unsigned) {
299
300             /* step 5: check the ACK field */
301             if (th->ack) {
302              int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
303
304             switch(sk->sk_state) {
305              case TCP_SYN_RECV:
306               if (acceptable) {
307                tp->copied_seq = tp->rcv_nxt;
308                mb();
309
310                // **************************** FINALLY TCP_ESTABLISHED ************************
311                tcp_set_state(sk, TCP_ESTABLISHED);
312                // *****************************************************************************
313                
314                sk->sk_state_change(sk);
315
316                // ...
317                
318               } else {
319                return 1;
320               }
321              break;
322             }
323
324        } //=net/ipv4/tcp_input.c::tcp_rcv_state_process()
325
326                 goto reset;
327            TCP_CHECK_TIMER(sk);
328            return 0;
329
330       } //=net/ipv4/tcp_ipv4.c::tcp_v4_do_rcv()
331      } //=net/ipv4/tcp_ipv4.c::tcp_v4_rcv()
332
333    } //=net/ipv4/ip_input.c:ip_local_deliver_finish()
334
335    return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish);
336
337   } //=net/ipv4/ip_input.c:ip_local_deliver()
338  } //=net/ipv4/ip_input.c:ip_rcv_finish()
339
340  return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);
341
342 } //=net/ipv4/ip_input.c:ip_rcv()
343 // Socket in state ESTABLISHED
344
345
346 /* Notes
347
348 See Apache directive for TCP_DEFER_ACCEPT http://httpd.apache.org/docs/trunk/mod/core.html#acceptfilter
349
350 man 7 tcp:
351         SOCKET OPTIONS
352        To set or get a TCP socket option, call getsockopt(2) to read or setsockopt(2) to write
353    
354        TCP_DEFER_ACCEPT
355               Allows a listener to be awakened only when data arrives on the socket.  Takes an integer value (seconds), this can bound the
356               maximum number of attempts TCP will make to complete the connection.  This option  should  not  be  used  in  code  intended  to  be
357               portable.
358
359 */