I am asking for your help regarding an error I have been experiencing for two weeks now.
I am working on a NAT64 implemented in XDP using BCC. The exchange between nodes works: my sender node sends the ping and the receiver node receives it. Between the two nodes there is another node, called router, to which the XDP program is attached. On one interface it translates from IPv6 to IPv4, and on the other interface it translates from IPv4 to IPv6.
The nodes are namespaces.
The problem occurs when the destination node receives the packet, the ICMP checksum I see from tcpdump is wrong (the ip checksum is not checked so I don't know if that one is right).
I've tried many different approaches to calculating the checksum that I found scattered around the internet, but none of them worked. I'm starting to think that the error might be somewhere else, so I've been checking and rechecking the code these past two weeks, but I can't find anything wrong.
This is my output from tcpdump -i veth1 -vvv:
16:10:45.883038 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.9.1 > archiz-linux: ICMP echo request, id 47244, seq 1, length 64 (wrong icmp cksum 3f72 (->6fc9)!)
I tried to disable checksum offload with `ethtool -K veth rx off tx off` for every veth in every namespace, but I get the same output as before. Maybe it didn't work because this approach doesn't cover the ICMP checksum, as this post says.
The checksum calculation function:
csum_fold_helper()
/*
 * Fold a one's-complement partial sum into a final 16-bit Internet checksum.
 *
 * csum:    accumulated (unfolded) one's-complement sum; all 64 bits are
 *          honoured, so sums that have grown past 32 bits fold correctly.
 * returns: the bitwise complement of the sum folded to 16 bits with
 *          end-around carry.
 */
static inline __u16 csum_fold_helper(__u64 csum)
{
    /* 64 -> 32 bits; the second pass absorbs the carry of the first. */
    csum = (csum & 0xffffffffULL) + (csum >> 32);
    csum = (csum & 0xffffffffULL) + (csum >> 32);

    /* 32 -> 16 bits; again two passes so that no carry is lost. */
    csum = (csum & 0xffff) + (csum >> 16);
    csum = (csum & 0xffff) + (csum >> 16);

    return (__u16)~csum;
}
And this is my implementation, I omitted parts for simplicity:
int xdp_router_func(struct xdp_md *ctx){
void *data = (void *)(long)ctx->data;
void *data_end = (void *)(long)ctx->data_end;
struct bpf_fib_lookup fib_params = {0};
struct ethhdr eth_cpy;
struct ethhdr *eth = data;
struct iphdr *iph = {0};
__u64 nh_off = sizeof(*eth);
if (data + nh_off > data_end)
return XDP_DROP;
if (eth->h_proto == bpf_htons(ETH_P_IPV6))
{
memcpy(ð_cpy, eth, sizeof(eth_cpy));
struct ipv6hdr *ip6h = data + nh_off;
if (ip6h + 1 > data_end)
return XDP_DROP;
...
struct iphdr dst_hdr = {
.version = 4,
.ihl = 5,
.frag_off = bpf_htons(1 << 14),
};
__u32 assigned_ipv4 = 0;
// search inside the natting_table the ipv6_addr associated to the ipv4
...
dst_hdr.saddr = bpf_htonl((__be32)assigned_ipv4);
dst_hdr.daddr = ip6h->daddr.s6_addr32[3];
dst_hdr.protocol = ip6h->nexthdr;
dst_hdr.ttl = ip6h->hop_limit;
dst_hdr.tos = ip6h->priority << 4 | (ip6h->flow_lbl[0] >> 4);
dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(dst_hdr));
// check if the packet is a icmpv6
if (dst_hdr.protocol == IPPROTO_ICMPV6)
{
struct icmp6hdr *icmp6h = (void *)ip6h + sizeof(*ip6h);
if (icmp6h + 1 > data_end)
return XDP_DROP;
// ready to parse the icmpv6 header in icmp
struct icmphdr tmp_icmp;
struct icmphdr *icmp;
// set the right type to icmp, id field and sequence field
if (write_icmp(&tmp_icmp, icmp6h) == -1)
{
bpf_trace_printk("[ERR]: error during icpmv6 parse in icmp");
return XDP_DROP;
}
if (bpf_xdp_adjust_head(ctx, (int)sizeof(*icmp6h) - (int)sizeof(tmp_icmp)))
return XDP_DROP;
// after the adjust head I have to reassign the pointers
data = (void *)(long)ctx->data;
data_end = (void *)(long)ctx->data_end;
icmp = (void *)(data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr));
if (icmp + 1 > data_end)
return XDP_DROP;
*icmp = tmp_icmp;
// set the checksum
icmp->checksum = 0x0000;
icmp->checksum = csum_fold_helper(bpf_csum_diff((__be32 *)icmp, 0, (__be32 *)icmp, sizeof(icmp), 0));
dst_hdr.protocol = IPPROTO_ICMP;
} // icmpv6
dst_hdr.check = csum_fold_helper(bpf_csum_diff((__be32 *)&dst_hdr, 0, (__be32 *)&dst_hdr, sizeof(dst_hdr), 0));
if (bpf_xdp_adjust_head(ctx, (int)sizeof(struct ipv6hdr) - (int)sizeof(struct iphdr)))
return XDP_DROP;
// after the adjust head I have to reassign the pointers
eth = (void *)(long)ctx->data;
data = (void *)(long)ctx->data;
data_end = (void *)(long)ctx->data_end;
if (eth + 1 > data_end)
return XDP_DROP;
memcpy(eth, ð_cpy, sizeof(*eth));
eth->h_proto = bpf_htons(ETH_P_IP);
iph = (void *)(data + sizeof(*eth));
if (iph + 1 > data_end)
{
bpf_trace_printk("iph out of boundary");
return XDP_DROP;
}
*iph = dst_hdr;
// start forwarding
// setting the fib_params
fib_params.family = AF_INET;
fib_params.tos = iph->tos;
fib_params.tot_len = bpf_ntohs(iph->tot_len);
fib_params.ipv4_dst = iph->daddr;
fib_params.ipv4_src = iph->saddr;
fib_params.ifindex = ctx->ingress_ifindex;
fib_params.sport = 0;
fib_params.dport = 0;
} // end ipv6
// forwarding
int rc;
rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0);
switch (rc)
{
case BPF_FIB_LKUP_RET_SUCCESS: /* lookup successful */
memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
int action;
action = bpf_redirect(fib_params.ifindex, 0);
return action;
// other cases
...
}
return XDP_PASS;
}
Cilium implements such a checksum calculation, so I'd use that (with proper attribution and copyleft license):
`icmp_wsum_accumulate` is defined in the sources (and is what I believe requires attribution and GPL-compatible licensing). `icmp_payload` and `icmp_payload_end` should point to the start and end of your ICMP message payload (typically the original IP packet).