diff --git a/examples/bpf/bpf_agent.c b/examples/bpf/bpf_agent.c deleted file mode 100644 index f9b9ce3c..00000000 --- a/examples/bpf/bpf_agent.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * eBPF user space agent part - * - * Simple, _self-contained_ user space agent for the eBPF kernel - * ebpf_prog.c program, which gets all map fds passed from tc via unix - * domain socket in one transaction and can thus keep referencing - * them from user space in order to read out (or possibly modify) - * map data. Here, just as a minimal example to display counters. - * - * The agent only uses the bpf(2) syscall API to read or possibly - * write to eBPF maps, it doesn't need to be aware of the low-level - * bytecode parts and/or ELF parsing bits. - * - * ! For more details, see header comment in bpf_prog.c ! - * - * gcc bpf_agent.c -o bpf_agent -Wall -O2 - * - * For example, a more complex user space agent could run on each - * host, reading and writing into eBPF maps used by tc classifier - * and actions. It would thus allow for implementing a distributed - * tc architecture, for example, which would push down central - * policies into eBPF maps, and thus altering run-time behaviour. - * - * -- Happy eBPF hacking! ;) - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Just some misc macros as min(), offsetof(), etc. */ -#include "../../include/utils.h" -/* Common code from fd passing. */ -#include "../../include/bpf_scm.h" -/* Common, shared definitions with ebpf_prog.c */ -#include "bpf_shared.h" -/* Mini syscall wrapper */ -#include "bpf_sys.h" - -static void bpf_dump_drops(int fd) -{ - int cpu, max; - - max = sysconf(_SC_NPROCESSORS_ONLN); - - printf(" `- number of drops:"); - for (cpu = 0; cpu < max; cpu++) { - long drops; - - assert(bpf_lookup_elem(fd, &cpu, &drops) == 0); - printf("\tcpu%d: %5ld", cpu, drops); - } - printf("\n"); -} - -static void bpf_dump_queue(int fd) -{ - /* Just for the same of the example. */ - int max_queue = 4, i; - - printf(" | nic queues:"); - for (i = 0; i < max_queue; i++) { - struct count_queue cq; - int ret; - - memset(&cq, 0, sizeof(cq)); - ret = bpf_lookup_elem(fd, &i, &cq); - assert(ret == 0 || (ret < 0 && errno == ENOENT)); - - printf("\tq%d:[pkts: %ld, mis: %ld]", - i, cq.total, cq.mismatch); - } - printf("\n"); -} - -static void bpf_dump_proto(int fd) -{ - uint8_t protos[] = { IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP }; - char *names[] = { "tcp", "udp", "icmp" }; - int i; - - printf(" ` protos:"); - for (i = 0; i < ARRAY_SIZE(protos); i++) { - struct count_tuple ct; - int ret; - - memset(&ct, 0, sizeof(ct)); - ret = bpf_lookup_elem(fd, &protos[i], &ct); - assert(ret == 0 || (ret < 0 && errno == ENOENT)); - - printf("\t%s:[pkts: %ld, bytes: %ld]", - names[i], ct.packets, ct.bytes); - } - printf("\n"); -} - -static void bpf_dump_map_data(int *tfd) -{ - int i; - - for (i = 0; i < 30; i++) { - const int period = 5; - - printf("data, period: %dsec\n", period); - - bpf_dump_drops(tfd[BPF_MAP_ID_DROPS]); - bpf_dump_queue(tfd[BPF_MAP_ID_QUEUE]); - bpf_dump_proto(tfd[BPF_MAP_ID_PROTO]); - - sleep(period); - } -} - -static void bpf_info_loop(int *fds, struct bpf_map_aux *aux) -{ - int i, tfd[BPF_MAP_ID_MAX]; - - printf("ver: %d\nobj: %s\ndev: %lu\nino: %lu\nmaps: %u\n", - aux->uds_ver, aux->obj_name, aux->obj_st.st_dev, - aux->obj_st.st_ino, aux->num_ent); - - for (i = 0; i < aux->num_ent; i++) { - printf("map%d:\n", i); - printf(" `- fd: %u\n", fds[i]); - printf(" | serial: %u\n", aux->ent[i].id); - printf(" | type: %u\n", aux->ent[i].type); - printf(" | max elem: %u\n", aux->ent[i].max_elem); - printf(" | size key: %u\n", aux->ent[i].size_key); - printf(" ` size val: %u\n", aux->ent[i].size_value); - - tfd[aux->ent[i].id] = fds[i]; - } - - bpf_dump_map_data(tfd); -} - -static void bpf_map_get_from_env(int *tfd) -{ - char key[64], *val; - int i; - - for (i = 0; i < BPF_MAP_ID_MAX; i++) { - memset(key, 0, sizeof(key)); - snprintf(key, sizeof(key), "BPF_MAP%d", i); - - val = getenv(key); - assert(val != NULL); - - tfd[i] = atoi(val); - } -} - -static int bpf_map_set_recv(int fd, int *fds, struct bpf_map_aux *aux, - unsigned int entries) -{ - struct bpf_map_set_msg msg; - int *cmsg_buf, min_fd, i; - char *amsg_buf, *mmsg_buf; - - cmsg_buf = bpf_map_set_init(&msg, NULL, 0); - amsg_buf = (char *)msg.aux.ent; - mmsg_buf = (char *)&msg.aux; - - for (i = 0; i < entries; i += min_fd) { - struct cmsghdr *cmsg; - int ret; - - min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i); - - bpf_map_set_init_single(&msg, min_fd); - - ret = recvmsg(fd, &msg.hdr, 0); - if (ret <= 0) - return ret ? : -1; - - cmsg = CMSG_FIRSTHDR(&msg.hdr); - if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS) - return -EINVAL; - if (msg.hdr.msg_flags & MSG_CTRUNC) - return -EIO; - - min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd); - if (min_fd > entries || min_fd <= 0) - return -1; - - memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd); - memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd); - memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent)); - - if (i + min_fd == aux->num_ent) - break; - } - - return 0; -} - -int main(int argc, char **argv) -{ - int fds[BPF_SCM_MAX_FDS]; - struct bpf_map_aux aux; - struct sockaddr_un addr; - int fd, ret, i; - - /* When arguments are being passed, we take it as a path - * to a Unix domain socket, otherwise we grab the fds - * from the environment to demonstrate both possibilities. - */ - if (argc == 1) { - int tfd[BPF_MAP_ID_MAX]; - - bpf_map_get_from_env(tfd); - bpf_dump_map_data(tfd); - - return 0; - } - - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (fd < 0) { - fprintf(stderr, "Cannot open socket: %s\n", - strerror(errno)); - exit(1); - } - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - strncpy(addr.sun_path, argv[argc - 1], sizeof(addr.sun_path)); - - ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr)); - if (ret < 0) { - fprintf(stderr, "Cannot bind to socket: %s\n", - strerror(errno)); - exit(1); - } - - memset(fds, 0, sizeof(fds)); - memset(&aux, 0, sizeof(aux)); - - ret = bpf_map_set_recv(fd, fds, &aux, BPF_SCM_MAX_FDS); - if (ret >= 0) - bpf_info_loop(fds, &aux); - - for (i = 0; i < aux.num_ent; i++) - close(fds[i]); - - close(fd); - return 0; -} diff --git a/examples/bpf/bpf_map_in_map.c b/examples/bpf/bpf_map_in_map.c new file mode 100644 index 00000000..ff0e623a --- /dev/null +++ b/examples/bpf/bpf_map_in_map.c @@ -0,0 +1,56 @@ +#include "../../include/bpf_api.h" + +#define MAP_INNER_ID 42 + +struct bpf_elf_map __section_maps map_inner = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(uint32_t), + .size_value = sizeof(uint32_t), + .id = MAP_INNER_ID, + .inner_idx = 0, + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +struct bpf_elf_map __section_maps map_outer = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .size_key = sizeof(uint32_t), + .size_value = sizeof(uint32_t), + .inner_id = MAP_INNER_ID, + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +__section("egress") +int emain(struct __sk_buff *skb) +{ + struct bpf_elf_map *map_inner; + int key = 0, *val; + + map_inner = map_lookup_elem(&map_outer, &key); + if (map_inner) { + val = map_lookup_elem(map_inner, &key); + if (val) + lock_xadd(val, 1); + } + + return BPF_H_DEFAULT; +} + +__section("ingress") +int imain(struct __sk_buff *skb) +{ + struct bpf_elf_map *map_inner; + int key = 0, *val; + + map_inner = map_lookup_elem(&map_outer, &key); + if (map_inner) { + val = map_lookup_elem(map_inner, &key); + if (val) + printt("map val: %d\n", *val); + } + + return BPF_H_DEFAULT; +} + +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c deleted file mode 100644 index d6caf374..00000000 --- a/examples/bpf/bpf_prog.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - * eBPF kernel space program part - * - * Toy eBPF program for demonstration purposes, some parts derived from - * kernel tree's samples/bpf/sockex2_kern.c example. - * - * More background on eBPF, kernel tree: Documentation/networking/filter.txt - * - * Note, this file is rather large, and most classifier and actions are - * likely smaller to accomplish one specific use-case and are tailored - * for high performance. For performance reasons, you might also have the - * classifier and action already merged inside the classifier. - * - * In order to show various features it serves as a bigger programming - * example, which you should feel free to rip apart and experiment with. - * - * Compilation, configuration example: - * - * Note: as long as the BPF backend in LLVM is still experimental, - * you need to build LLVM with LLVM with --enable-experimental-targets=BPF - * Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y, - * and you have libelf.h and gelf.h headers and can link tc against -lelf. - * - * In case you need to sync kernel headers, go to your kernel source tree: - * # make headers_install INSTALL_HDR_PATH=/usr/ - * - * $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH - * $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o - * $ objdump -h bpf.o - * [...] - * 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3 - * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE - * 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2 - * CONTENTS, ALLOC, LOAD, DATA - * 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0 - * CONTENTS, ALLOC, LOAD, DATA - * [...] - * # echo 1 > /proc/sys/net/core/bpf_jit_enable - * $ gcc bpf_agent.c -o bpf_agent -Wall -O2 - * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal) - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * # tc filter show dev em1 - * filter parent 1: protocol all pref 49152 bpf - * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier] - * action order 1: bpf bpf.o:[action-mark] default-action pipe - * index 52 ref 1 bind 1 - * - * action order 2: bpf bpf.o:[action-rand] default-action pipe - * index 53 ref 1 bind 1 - * - * action order 3: gact action pass - * random type none pass val 0 - * index 38 ref 1 bind 1 - * - * The same program can also be installed on ingress side (as opposed to above - * egress configuration), e.g.: - * - * # tc qdisc add dev em1 handle ffff: ingress - * # tc filter add dev em1 parent ffff: bpf obj ... - * - * Notes on BPF agent: - * - * In the above example, the bpf_agent creates the unix domain socket - * natively. "tc exec" can also spawn a shell and hold the socktes there: - * - * # tc exec bpf imp /tmp/bpf-uds - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * sh-4.2# (shell spawned from tc exec) - * sh-4.2# bpf_agent - * [...] - * - * This will read out fds over environment and produce the same data dump - * as below. This has the advantage that the spawned shell owns the fds - * and thus if the agent is restarted, it can reattach to the same fds, also - * various programs can easily read/modify the data simultaneously from user - * space side. - * - * If the shell is unnecessary, the agent can also just be spawned directly - * via tc exec: - * - * # tc exec bpf imp /tmp/bpf-uds run bpf_agent - * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ - * action bpf obj bpf.o sec action-mark \ - * action bpf obj bpf.o sec action-rand ok - * - * BPF agent example output: - * - * ver: 1 - * obj: bpf.o - * dev: 64770 - * ino: 6045133 - * maps: 3 - * map0: - * `- fd: 4 - * | serial: 1 - * | type: 1 - * | max elem: 256 - * | size key: 1 - * ` size val: 16 - * map1: - * `- fd: 5 - * | serial: 2 - * | type: 1 - * | max elem: 1024 - * | size key: 4 - * ` size val: 16 - * map2: - * `- fd: 6 - * | serial: 3 - * | type: 2 - * | max elem: 64 - * | size key: 4 - * ` size val: 8 - * data, period: 5sec - * `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0] - * data, period: 5sec - * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0] - * data, period: 5sec - * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3 - * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0] - * ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0] - * [...] - * - * This now means, the below classifier and action pipeline has been loaded - * as eBPF bytecode into the kernel, the kernel has verified that the - * execution of the bytecode is "safe", and it has JITed the programs - * afterwards, so that upon invocation they're running on native speed. tc - * has transferred all map file descriptors to the bpf_agent via IPC and - * even after tc exits, the agent can read out or modify all map data. - * - * Note that the export to the uds is done only once in the classifier and - * not in the action. It's enough to export the (here) shared descriptors - * once. - * - * If you need to disassemble the generated JIT image (echo with 2), the - * kernel tree has under tools/net/ a small helper, you can invoke e.g. - * `bpf_jit_disasm -o`. - * - * Please find in the code below further comments. - * - * -- Happy eBPF hacking! ;) - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Common, shared definitions with ebpf_agent.c. */ -#include "bpf_shared.h" -/* BPF helper functions for our example. */ -#include "../../include/bpf_api.h" - -/* Could be defined here as well, or included from the header. */ -#define TC_ACT_UNSPEC (-1) -#define TC_ACT_OK 0 -#define TC_ACT_RECLASSIFY 1 -#define TC_ACT_SHOT 2 -#define TC_ACT_PIPE 3 -#define TC_ACT_STOLEN 4 -#define TC_ACT_QUEUED 5 -#define TC_ACT_REPEAT 6 - -/* Other, misc stuff. */ -#define IP_MF 0x2000 -#define IP_OFFSET 0x1FFF - -/* eBPF map definitions, all placed in section "maps". */ -struct bpf_elf_map __section("maps") map_proto = { - .type = BPF_MAP_TYPE_HASH, - .id = BPF_MAP_ID_PROTO, - .size_key = sizeof(uint8_t), - .size_value = sizeof(struct count_tuple), - .max_elem = 256, - .flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_elf_map __section("maps") map_queue = { - .type = BPF_MAP_TYPE_HASH, - .id = BPF_MAP_ID_QUEUE, - .size_key = sizeof(uint32_t), - .size_value = sizeof(struct count_queue), - .max_elem = 1024, - .flags = BPF_F_NO_PREALLOC, -}; - -struct bpf_elf_map __section("maps") map_drops = { - .type = BPF_MAP_TYPE_ARRAY, - .id = BPF_MAP_ID_DROPS, - .size_key = sizeof(uint32_t), - .size_value = sizeof(long), - .max_elem = 64, -}; - -/* Helper functions and definitions for the flow dissector used by the - * example classifier. This resembles the kernel's flow dissector to - * some extend and is just used as an example to show what's possible - * with eBPF. - */ -struct sockaddr; - -struct vlan_hdr { - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; -}; - -struct flow_keys { - __u32 src; - __u32 dst; - union { - __u32 ports; - __u16 port16[2]; - }; - __s32 th_off; - __u8 ip_proto; -}; - -static __inline__ int flow_ports_offset(__u8 ip_proto) -{ - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - default: - return 0; - case IPPROTO_AH: - return 4; - } -} - -static __inline__ bool flow_is_frag(struct __sk_buff *skb, int nh_off) -{ - return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) & - (IP_MF | IP_OFFSET)); -} - -static __inline__ int flow_parse_ipv4(struct __sk_buff *skb, int nh_off, - __u8 *ip_proto, struct flow_keys *flow) -{ - __u8 ip_ver_len; - - if (unlikely(flow_is_frag(skb, nh_off))) - *ip_proto = 0; - else - *ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr, - protocol)); - if (*ip_proto != IPPROTO_GRE) { - flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr)); - flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr)); - } - - ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */); - if (likely(ip_ver_len == 0x45)) - nh_off += 20; - else - nh_off += (ip_ver_len & 0xF) << 2; - - return nh_off; -} - -static __inline__ __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off) -{ - __u32 w0 = load_word(skb, off); - __u32 w1 = load_word(skb, off + sizeof(w0)); - __u32 w2 = load_word(skb, off + sizeof(w0) * 2); - __u32 w3 = load_word(skb, off + sizeof(w0) * 3); - - return w0 ^ w1 ^ w2 ^ w3; -} - -static __inline__ int flow_parse_ipv6(struct __sk_buff *skb, int nh_off, - __u8 *ip_proto, struct flow_keys *flow) -{ - *ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr)); - - flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr)); - flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr)); - - return nh_off + sizeof(struct ipv6hdr); -} - -static __inline__ bool flow_dissector(struct __sk_buff *skb, - struct flow_keys *flow) -{ - int poff, nh_off = BPF_LL_OFF + ETH_HLEN; - __be16 proto = skb->protocol; - __u8 ip_proto; - - /* TODO: check for skb->vlan_tci, skb->vlan_proto first */ - if (proto == htons(ETH_P_8021AD)) { - proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - if (proto == htons(ETH_P_8021Q)) { - proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - - if (likely(proto == htons(ETH_P_IP))) - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - else if (proto == htons(ETH_P_IPV6)) - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - else - return false; - - switch (ip_proto) { - case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - }; - - __u16 gre_flags = load_half(skb, nh_off + - offsetof(struct gre_hdr, flags)); - __u16 gre_proto = load_half(skb, nh_off + - offsetof(struct gre_hdr, proto)); - - if (gre_flags & (GRE_VERSION | GRE_ROUTING)) - break; - - nh_off += 4; - if (gre_flags & GRE_CSUM) - nh_off += 4; - if (gre_flags & GRE_KEY) - nh_off += 4; - if (gre_flags & GRE_SEQ) - nh_off += 4; - - if (gre_proto == ETH_P_8021Q) { - gre_proto = load_half(skb, nh_off + - offsetof(struct vlan_hdr, - h_vlan_encapsulated_proto)); - nh_off += sizeof(struct vlan_hdr); - } - if (gre_proto == ETH_P_IP) - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - else if (gre_proto == ETH_P_IPV6) - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - else - return false; - break; - } - case IPPROTO_IPIP: - nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); - break; - case IPPROTO_IPV6: - nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); - default: - break; - } - - nh_off += flow_ports_offset(ip_proto); - - flow->ports = load_word(skb, nh_off); - flow->th_off = nh_off; - flow->ip_proto = ip_proto; - - return true; -} - -static __inline__ void cls_update_proto_map(const struct __sk_buff *skb, - const struct flow_keys *flow) -{ - uint8_t proto = flow->ip_proto; - struct count_tuple *ct, _ct; - - ct = map_lookup_elem(&map_proto, &proto); - if (likely(ct)) { - lock_xadd(&ct->packets, 1); - lock_xadd(&ct->bytes, skb->len); - return; - } - - /* No hit yet, we need to create a new entry. */ - _ct.packets = 1; - _ct.bytes = skb->len; - - map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); -} - -static __inline__ void cls_update_queue_map(const struct __sk_buff *skb) -{ - uint32_t queue = skb->queue_mapping; - struct count_queue *cq, _cq; - bool mismatch; - - mismatch = skb->queue_mapping != get_smp_processor_id(); - - cq = map_lookup_elem(&map_queue, &queue); - if (likely(cq)) { - lock_xadd(&cq->total, 1); - if (mismatch) - lock_xadd(&cq->mismatch, 1); - return; - } - - /* No hit yet, we need to create a new entry. */ - _cq.total = 1; - _cq.mismatch = mismatch ? 1 : 0; - - map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); -} - -/* eBPF program definitions, placed in various sections, which can - * have custom section names. If custom names are in use, it's - * required to point tc to the correct section, e.g. - * - * tc filter add [...] bpf obj cls.o sec cls-tos [...] - * - * in case the program resides in __section("cls-tos"). - * - * Default section for cls_bpf is: "classifier", for act_bpf is: - * "action". Naturally, if for example multiple actions are present - * in the same file, they need to have distinct section names. - * - * It is however not required to have multiple programs sharing - * a file. - */ -__section("classifier") -int cls_main(struct __sk_buff *skb) -{ - struct flow_keys flow; - - if (!flow_dissector(skb, &flow)) - return 0; /* No match in cls_bpf. */ - - cls_update_proto_map(skb, &flow); - cls_update_queue_map(skb); - - return flow.ip_proto; -} - -static __inline__ void act_update_drop_map(void) -{ - uint32_t *count, cpu = get_smp_processor_id(); - - count = map_lookup_elem(&map_drops, &cpu); - if (count) - /* Only this cpu is accessing this element. */ - (*count)++; -} - -__section("action-mark") -int act_mark_main(struct __sk_buff *skb) -{ - /* You could also mangle skb data here with the helper function - * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could - * do that already in the classifier itself as a merged combination - * of classifier'n'action model. - */ - - if (skb->mark == 0xcafe) { - act_update_drop_map(); - return TC_ACT_SHOT; - } - - /* Default configured tc opcode. */ - return TC_ACT_UNSPEC; -} - -__section("action-rand") -int act_rand_main(struct __sk_buff *skb) -{ - /* Sorry, we're near event horizon ... */ - if ((get_prandom_u32() & 3) == 0) { - act_update_drop_map(); - return TC_ACT_SHOT; - } - - return TC_ACT_UNSPEC; -} - -/* Last but not least, the file contains a license. Some future helper - * functions may only be available with a GPL license. - */ -BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h deleted file mode 100644 index a24038dd..00000000 --- a/examples/bpf/bpf_shared.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __BPF_SHARED__ -#define __BPF_SHARED__ - -enum { - BPF_MAP_ID_PROTO, - BPF_MAP_ID_QUEUE, - BPF_MAP_ID_DROPS, - __BPF_MAP_ID_MAX, -#define BPF_MAP_ID_MAX __BPF_MAP_ID_MAX -}; - -struct count_tuple { - long packets; /* type long for lock_xadd() */ - long bytes; -}; - -struct count_queue { - long total; - long mismatch; -}; - -#endif /* __BPF_SHARED__ */ diff --git a/examples/bpf/bpf_sys.h b/examples/bpf/bpf_sys.h deleted file mode 100644 index 6e4f09e2..00000000 --- a/examples/bpf/bpf_sys.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __BPF_SYS__ -#define __BPF_SYS__ - -#include -#include - -static inline __u64 bpf_ptr_to_u64(const void *ptr) -{ - return (__u64) (unsigned long) ptr; -} - -static inline int bpf_lookup_elem(int fd, void *key, void *value) -{ - union bpf_attr attr = { - .map_fd = fd, - .key = bpf_ptr_to_u64(key), - .value = bpf_ptr_to_u64(value), - }; - - return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); -} - -#endif /* __BPF_SYS__ */ diff --git a/include/bpf_elf.h b/include/bpf_elf.h index 239a0f36..406c3087 100644 --- a/include/bpf_elf.h +++ b/include/bpf_elf.h @@ -36,6 +36,8 @@ struct bpf_elf_map { __u32 flags; __u32 id; __u32 pinning; + __u32 inner_id; + __u32 inner_idx; }; #endif /* __BPF_ELF__ */ diff --git a/include/bpf_util.h b/include/bpf_util.h index 5361dab1..6582ec8c 100644 --- a/include/bpf_util.h +++ b/include/bpf_util.h @@ -261,6 +261,8 @@ int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, int bpf_prog_attach_fd(int prog_fd, int target_fd, enum bpf_attach_type type); int bpf_prog_detach_fd(int target_fd, enum bpf_attach_type type); +void bpf_dump_prog_info(FILE *f, uint32_t id); + #ifdef HAVE_ELF int bpf_send_map_fds(const char *path, const char *obj); int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux, diff --git a/ip/ipaddress.c b/ip/ipaddress.c index f06f5829..cf8ef818 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/ip/iplink.c b/ip/iplink.c index 9674cb65..5aff2fde 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/ip/iproute.c b/ip/iproute.c index 4e022d77..a735d281 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1731,6 +1731,16 @@ static int iproute_get(int argc, char **argv) addattr32(&req.n, sizeof(req), RTA_UID, uid); } else if (matches(*argv, "fibmatch") == 0) { fib_match = 1; + } else if (strcmp(*argv, "as") == 0) { + inet_prefix addr; + + NEXT_ARG(); + if (strcmp(*argv, "to") == 0) + NEXT_ARG(); + get_addr(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + addattr_l(&req.n, sizeof(req), RTA_NEWDST, &addr.data, addr.bytelen); } else { inet_prefix addr; diff --git a/lib/bpf.c b/lib/bpf.c index 6b5a96d0..7eb5cd96 100644 --- a/lib/bpf.c +++ b/lib/bpf.c @@ -152,6 +152,54 @@ static int bpf_map_update(int fd, const void *key, const void *value, return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } +static int bpf_prog_fd_by_id(uint32_t id) +{ + union bpf_attr attr = {}; + + attr.prog_id = id; + + return bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); +} + +static int bpf_prog_info_by_fd(int fd, struct bpf_prog_info *info, + uint32_t *info_len) +{ + union bpf_attr attr = {}; + int ret; + + attr.info.bpf_fd = fd; + attr.info.info = bpf_ptr_to_u64(info); + attr.info.info_len = *info_len; + + *info_len = 0; + ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (!ret) + *info_len = attr.info.info_len; + + return ret; +} + +void bpf_dump_prog_info(FILE *f, uint32_t id) +{ + struct bpf_prog_info info = {}; + uint32_t len = sizeof(info); + int fd, ret; + + fprintf(f, "id %u ", id); + + fd = bpf_prog_fd_by_id(id); + if (fd < 0) + return; + + ret = bpf_prog_info_by_fd(fd, &info, &len); + if (!ret && len) { + if (info.jited_prog_len) + fprintf(f, "jited "); + } + + close(fd); +} + static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, char **bpf_string, bool *need_release, const char separator) @@ -1023,15 +1071,16 @@ static int bpf_log_realloc(struct bpf_elf_ctx *ctx) static int bpf_map_create(enum bpf_map_type type, uint32_t size_key, uint32_t size_value, uint32_t max_elem, - uint32_t flags) + uint32_t flags, int inner_fd) { union bpf_attr attr = {}; attr.map_type = type; attr.key_size = size_key; - attr.value_size = size_value; + attr.value_size = inner_fd ? sizeof(int) : size_value; attr.max_entries = max_elem; attr.map_flags = flags; + attr.inner_map_fd = inner_fd; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } @@ -1343,7 +1392,7 @@ retry: static void bpf_map_report(int fd, const char *name, const struct bpf_elf_map *map, - struct bpf_elf_ctx *ctx) + struct bpf_elf_ctx *ctx, int inner_fd) { fprintf(stderr, "Map object \'%s\' %s%s (%d)!\n", name, fd < 0 ? "rejected: " : "loaded", @@ -1354,15 +1403,91 @@ static void bpf_map_report(int fd, const char *name, fprintf(stderr, " - Identifier: %u\n", map->id); fprintf(stderr, " - Pinning: %u\n", map->pinning); fprintf(stderr, " - Size key: %u\n", map->size_key); - fprintf(stderr, " - Size value: %u\n", map->size_value); + fprintf(stderr, " - Size value: %u\n", + inner_fd ? (int)sizeof(int) : map->size_value); fprintf(stderr, " - Max elems: %u\n", map->max_elem); fprintf(stderr, " - Flags: %#x\n\n", map->flags); } -static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, - struct bpf_elf_ctx *ctx) +static int bpf_find_map_id(const struct bpf_elf_ctx *ctx, uint32_t id) { - int fd, ret; + int i; + + for (i = 0; i < ctx->map_num; i++) { + if (ctx->maps[i].id != id) + continue; + if (ctx->map_fds[i] < 0) + return -EINVAL; + + return ctx->map_fds[i]; + } + + return -ENOENT; +} + +static int bpf_derive_elf_map_from_fdinfo(int fd, struct bpf_elf_map *map) +{ + char file[PATH_MAX], buff[4096]; + unsigned int val; + FILE *fp; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + + memset(map, 0, sizeof(*map)); + + fp = fopen(file, "r"); + if (!fp) { + fprintf(stderr, "No procfs support?!\n"); + return -EIO; + } + + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + map->type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + map->size_key = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + map->size_value = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + map->max_elem = val; + else if (sscanf(buff, "map_flags:\t%i", &val) == 1) + map->flags = val; + } + + fclose(fp); + return 0; +} + +static void bpf_report_map_in_map(int outer_fd, int inner_fd, uint32_t idx) +{ + struct bpf_elf_map outer_map; + int ret; + + fprintf(stderr, "Cannot insert map into map! "); + + ret = bpf_derive_elf_map_from_fdinfo(outer_fd, &outer_map); + if (!ret) { + if (idx >= outer_map.max_elem && + outer_map.type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + fprintf(stderr, "Outer map has %u elements, index %u is invalid!\n", + outer_map.max_elem, idx); + return; + } + } + + fprintf(stderr, "Different map specs used for outer and inner map?\n"); +} + +static bool bpf_is_map_in_map_type(const struct bpf_elf_map *map) +{ + return map->type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, + struct bpf_elf_ctx *ctx, int *have_map_in_map) +{ + int fd, ret, map_inner_fd = 0; fd = bpf_probe_pinned(name, ctx, map->pinning); if (fd > 0) { @@ -1381,11 +1506,29 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, return fd; } + if (have_map_in_map && bpf_is_map_in_map_type(map)) { + (*have_map_in_map)++; + if (map->inner_id) + return 0; + fprintf(stderr, "Map \'%s\' cannot be created since no inner map ID defined!\n", + name); + return -EINVAL; + } + + if (!have_map_in_map && bpf_is_map_in_map_type(map)) { + map_inner_fd = bpf_find_map_id(ctx, map->inner_id); + if (map_inner_fd < 0) { + fprintf(stderr, "Map \'%s\' cannot be loaded. Inner map with ID %u not found!\n", + name, map->inner_id); + return -EINVAL; + } + } + errno = 0; fd = bpf_map_create(map->type, map->size_key, map->size_value, - map->max_elem, map->flags); + map->max_elem, map->flags, map_inner_fd); if (fd < 0 || ctx->verbose) { - bpf_map_report(fd, name, map, ctx); + bpf_map_report(fd, name, map, ctx, map_inner_fd); if (fd < 0) return fd; } @@ -1430,21 +1573,63 @@ static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which) static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx) { + int i, j, ret, fd, inner_fd, inner_idx, have_map_in_map = 0; const char *map_name; - int i, fd; for (i = 0; i < ctx->map_num; i++) { map_name = bpf_map_fetch_name(ctx, i); if (!map_name) return -EIO; - fd = bpf_map_attach(map_name, &ctx->maps[i], ctx); + fd = bpf_map_attach(map_name, &ctx->maps[i], ctx, + &have_map_in_map); + if (fd < 0) + return fd; + + ctx->map_fds[i] = !fd ? -1 : fd; + } + + for (i = 0; have_map_in_map && i < ctx->map_num; i++) { + if (ctx->map_fds[i] >= 0) + continue; + + map_name = bpf_map_fetch_name(ctx, i); + if (!map_name) + return -EIO; + + fd = bpf_map_attach(map_name, &ctx->maps[i], ctx, + NULL); if (fd < 0) return fd; ctx->map_fds[i] = fd; } + for (i = 0; have_map_in_map && i < ctx->map_num; i++) { + if (!ctx->maps[i].id || + ctx->maps[i].inner_id || + ctx->maps[i].inner_idx == -1) + continue; + + inner_fd = ctx->map_fds[i]; + inner_idx = ctx->maps[i].inner_idx; + + for (j = 0; j < ctx->map_num; j++) { + if (!bpf_is_map_in_map_type(&ctx->maps[j])) + continue; + if (ctx->maps[j].inner_id != ctx->maps[i].id) + continue; + + ret = bpf_map_update(ctx->map_fds[j], &inner_idx, + &inner_fd, BPF_ANY); + if (ret < 0) { + bpf_report_map_in_map(ctx->map_fds[j], + inner_fd, inner_idx); + return ret; + } + } + } + return 0; } diff --git a/lib/ll_addr.c b/lib/ll_addr.c index 465ed6fa..5b5caf3d 100644 --- a/lib/ll_addr.c +++ b/lib/ll_addr.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/lib/ll_proto.c b/lib/ll_proto.c index e094d9f8..ef5a5b7b 100644 --- a/lib/ll_proto.c +++ b/lib/ll_proto.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/lib/ll_types.c b/lib/ll_types.c index eca617f3..8f294515 100644 --- a/lib/ll_types.c +++ b/lib/ll_types.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/man/man8/tc-csum.8 b/man/man8/tc-csum.8 index 718301de..409ab717 100644 --- a/man/man8/tc-csum.8 +++ b/man/man8/tc-csum.8 @@ -29,9 +29,9 @@ csum - checksum update action The .B csum action triggers checksum recalculation of specified packet headers. It is -commonly used after packet editing using the +commonly used to fix incorrect checksums after the .B pedit -action to fix for then incorrect checksums. +action has modified the packet content. .SH OPTIONS .TP .I TARGET diff --git a/netem/paretonormal.c b/netem/paretonormal.c index 83ec87d4..9773e370 100644 --- a/netem/paretonormal.c +++ b/netem/paretonormal.c @@ -11,7 +11,6 @@ */ #include #include -#include #include #include #include diff --git a/tc/f_bpf.c b/tc/f_bpf.c index 75c44c06..2f8d12a6 100644 --- a/tc/f_bpf.c +++ b/tc/f_bpf.c @@ -230,6 +230,9 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, b, sizeof(b))); } + if (tb[TCA_BPF_ID]) + bpf_dump_prog_info(f, rta_getattr_u32(tb[TCA_BPF_ID])); + if (tb[TCA_BPF_POLICE]) { fprintf(f, "\n"); tc_print_police(f, tb[TCA_BPF_POLICE]); diff --git a/tc/m_bpf.c b/tc/m_bpf.c index 57283030..df559bcc 100644 --- a/tc/m_bpf.c +++ b/tc/m_bpf.c @@ -186,6 +186,9 @@ static int bpf_print_opt(struct action_util *au, FILE *f, struct rtattr *arg) b, sizeof(b))); } + if (tb[TCA_ACT_BPF_ID]) + bpf_dump_prog_info(f, rta_getattr_u32(tb[TCA_ACT_BPF_ID])); + print_action_control(f, "default-action ", parm->action, "\n"); fprintf(f, "\tindex %u ref %d bind %d", parm->index, parm->refcnt, parm->bindcnt);