From 54ba1239e7e54ec4ec0b6c0f1c9415e1e9d89574 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 14 Feb 2023 11:11:54 +0800 Subject: [PATCH 1/2] bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room() maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6EW1Q CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=d219df60a70ed0739aa5dd34b477763311fc5a7b -------------------------------- Add ipip6 and ip6ip decap support for bpf_skb_adjust_room(). Main use case is for using cls_bpf on ingress hook to decapsulate IPv4 over IPv6 and IPv6 over IPv4 tunnel packets. Add two new flags BPF_F_ADJ_ROOM_DECAP_L3_IPV{4,6} to indicate the new IP header version after decapsulating the outer IP header. Suggested-by: Willem de Bruijn Signed-off-by: Ziyang Xuan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/b268ec7f0ff9431f4f43b1b40ab856ebb28cb4e1.1673574419.git.william.xuanziyang@huawei.com Signed-off-by: Martin KaFai Lau Conflicts: include/uapi/linux/bpf.h tools/include/uapi/linux/bpf.h Signed-off-by: Ziyang Xuan --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 31 ++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 7 +++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f39d3bcaf3de..153bad806860 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1781,6 +1781,11 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4370,6 +4375,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index 727178a4f5e9..95de1cd79d15 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3407,12 +3407,16 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3518,6 +3522,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3536,6 +3541,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3626,6 +3639,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 
return -ENOTSUPP; } + if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + if (!shrink) + return -EINVAL; + + switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { + case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: + len_min = sizeof(struct iphdr); + break; + case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: + len_min = sizeof(struct ipv6hdr); + break; + default: + return -EINVAL; + } + } + len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c7fc4805b78..a6017f2f64f5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2491,6 +2491,11 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -5081,6 +5086,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { -- Gitee From 172bfbd2c137958f0f9bf895e2b39f4eae476abc Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 14 Feb 2023 11:11:55 +0800 Subject: [PATCH 2/2] selftests/bpf: add ipip6 and ip6ip decap to test_tc_tunnel maillist inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I6EW1Q CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=7105f76fb56f5ed66a59bc048bc71e9f100e1d39 -------------------------------- Add ipip6 and ip6ip decap testcases. Verify that bpf_skb_adjust_room() correctly decapsulates ipip6 and ip6ip tunnel packets. Signed-off-by: Ziyang Xuan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/dfd2d8cfdf9111bd129170d4345296f53bee6a67.1673574419.git.william.xuanziyang@huawei.com Signed-off-by: Martin KaFai Lau Conflicts: tools/testing/selftests/bpf/progs/test_tc_tunnel.c tools/testing/selftests/bpf/test_tc_tunnel.sh Signed-off-by: Ziyang Xuan --- .../selftests/bpf/progs/test_tc_tunnel.c | 91 ++++++++++++++++++- tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 +-- 2 files changed, 98 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 37bce7a7c394..f969f953a4a5 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -28,6 +28,10 @@ static const int cfg_udp_src = 20000; #define MPLS_OVER_UDP_PORT 6635 #define ETH_OVER_UDP_PORT 7777 +#ifndef NEXTHDR_DEST +#define NEXTHDR_DEST 60 +#endif + /* MPLS label 1000 with S bit (last label) set and ttl 
of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); @@ -309,6 +313,61 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static int encap_ipv6_ipip6(struct __sk_buff *skb) +{ + struct iphdr iph_inner; + struct v6hdr h_outer; + struct tcphdr tcph; + struct ethhdr eth; + __u64 flags; + int olen; + + if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner, + sizeof(iph_inner)) < 0) + return TC_ACT_OK; + + /* filter only packets we want */ + if (bpf_skb_load_bytes(skb, ETH_HLEN + (iph_inner.ihl << 2), + &tcph, sizeof(tcph)) < 0) + return TC_ACT_OK; + + if (tcph.dest != __bpf_constant_htons(cfg_port)) + return TC_ACT_OK; + + olen = sizeof(h_outer.ip); + + flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6; + + /* add room between mac and network header */ + if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags)) + return TC_ACT_SHOT; + + /* prepare new outer network header */ + memset(&h_outer.ip, 0, sizeof(h_outer.ip)); + h_outer.ip.version = 6; + h_outer.ip.hop_limit = iph_inner.ttl; + h_outer.ip.saddr.s6_addr[1] = 0xfd; + h_outer.ip.saddr.s6_addr[15] = 1; + h_outer.ip.daddr.s6_addr[1] = 0xfd; + h_outer.ip.daddr.s6_addr[15] = 2; + h_outer.ip.payload_len = iph_inner.tot_len; + h_outer.ip.nexthdr = IPPROTO_IPIP; + + /* store new outer network header */ + if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen, + BPF_F_INVALIDATE_HASH) < 0) + return TC_ACT_SHOT; + + /* update eth->h_proto */ + if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0) + return TC_ACT_SHOT; + eth.h_proto = bpf_htons(ETH_P_IPV6); + if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0) + return TC_ACT_SHOT; + + return TC_ACT_OK; +} + SEC("encap_ipip_none") int __encap_ipip_none(struct __sk_buff *skb) { @@ -390,6 +449,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ipip6_none") +int __encap_ipip6_none(struct __sk_buff *skb) +{ + if (skb->protocol == 
__bpf_constant_htons(ETH_P_IP)) + return encap_ipv6_ipip6(skb); + else + return TC_ACT_OK; +} + SEC("encap_ip6gre_none") int __encap_ip6gre_none(struct __sk_buff *skb) { @@ -447,13 +515,33 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { char buf[sizeof(struct v6hdr)]; + __u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO; + struct ipv6_opt_hdr ip6_opt_hdr; struct gre_hdr greh; struct udphdr udph; int olen = len; switch (proto) { case IPPROTO_IPIP: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + break; case IPPROTO_IPV6: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + break; + case NEXTHDR_DEST: + if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr, + sizeof(ip6_opt_hdr)) < 0) + return TC_ACT_OK; + switch (ip6_opt_hdr.nexthdr) { + case IPPROTO_IPIP: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4; + break; + case IPPROTO_IPV6: + flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6; + break; + default: + return TC_ACT_OK; + } break; case IPPROTO_GRE: olen += sizeof(struct gre_hdr); @@ -485,8 +573,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) return TC_ACT_OK; } - if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, - BPF_F_ADJ_ROOM_FIXED_GSO)) + if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags)) return TC_ACT_SHOT; return TC_ACT_OK; diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7c76b841b17b..f897ecc694f6 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -99,6 +99,9 @@ if [[ "$#" -eq "0" ]]; then echo "ipip" $0 ipv4 ipip none 100 + echo "ipip6" + $0 ipv4 ipip6 none 100 + echo "ip6ip6" $0 ipv6 ip6tnl none 100 @@ -214,6 +217,9 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $dport" elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ttype=$gretaptype +elif [[ "$tuntype" == "ipip6" ]]; then + ttype="ip6tnl" + targs="" 
else ttype=$tuntype targs="" @@ -223,6 +229,9 @@ fi if [[ "${tuntype}" == "sit" ]]; then link_addr1="${ns1_v4}" link_addr2="${ns2_v4}" +elif [[ "${tuntype}" == "ipip6" ]]; then + link_addr1="${ns1_v6}" + link_addr2="${ns2_v6}" else link_addr1="${addr1}" link_addr2="${addr2}" @@ -277,12 +286,6 @@ else server_listen fi -# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3. -if [[ "${tuntype}" == "sit" ]]; then - echo OK - exit 0 -fi - # serverside, use BPF for decap ip netns exec "${ns2}" ip link del dev testtun0 ip netns exec "${ns2}" tc qdisc add dev veth2 clsact -- Gitee