From 517771e2718299996b21acf4d9ca7995e58be6d0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 18 Jul 2017 17:16:56 -0700
Subject: [PATCH 1/7] update headers to 4.13-rc1

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 include/linux/bpf.h   | 82 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/magic.h |  3 ++
 include/linux/sctp.h  |  6 ++++
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8a966ef0..fb84d2c2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -120,12 +120,14 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_IN,
 	BPF_PROG_TYPE_LWT_OUT,
 	BPF_PROG_TYPE_LWT_XMIT,
+	BPF_PROG_TYPE_SOCK_OPS,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
+	BPF_CGROUP_SOCK_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -518,6 +520,25 @@ union bpf_attr {
  *     Set full skb->hash.
  *     @skb: pointer to skb
  *     @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ *     Calls setsockopt. Not all opts are available, only those with
+ *     integer optvals plus TCP_CONGESTION.
+ *     Supported levels: SOL_SOCKET and IPPROTO_TCP
+ *     @bpf_socket: pointer to bpf_socket
+ *     @level: SOL_SOCKET or IPPROTO_TCP
+ *     @optname: option name
+ *     @optval: pointer to option value
+ *     @optlen: length of optval in bytes
+ *     Return: 0 or negative error
+ *
+ * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
+ *     Grow or shrink room in sk_buff.
+ *     @skb: pointer to skb
+ *     @len_diff: (signed) amount of room to grow/shrink
+ *     @mode: operation mode (enum bpf_adj_room_mode)
+ *     @flags: reserved for future use
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -568,7 +589,9 @@ union bpf_attr {
 	FN(probe_read_str),		\
 	FN(get_socket_cookie),		\
 	FN(get_socket_uid),		\
-	FN(set_hash),
+	FN(set_hash),			\
+	FN(setsockopt),			\
+	FN(skb_adjust_room),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -618,6 +641,11 @@ enum bpf_func_id {
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
 #define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
 
+/* Mode for BPF_FUNC_skb_adjust_room helper. */
+enum bpf_adj_room_mode {
+	BPF_ADJ_ROOM_NET,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -720,4 +748,56 @@ struct bpf_map_info {
 	__u32 map_flags;
 } __attribute__((aligned(8)));
 
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of these fields are in network (big-endian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+	__u32 op;
+	union {
+		__u32 reply;
+		__u32 replylong[4];
+	};
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+	BPF_SOCK_OPS_VOID,
+	BPF_SOCK_OPS_TIMEOUT_INIT,	/* Should return SYN-RTO value to use or
+					 * -1 if default value should be used
+					 */
+	BPF_SOCK_OPS_RWND_INIT,		/* Should return initial advertized
+					 * window (in packets) or -1 if default
+					 * value should be used
+					 */
+	BPF_SOCK_OPS_TCP_CONNECT_CB,	/* Calls BPF program right before an
+					 * active connection is initialized
+					 */
+	BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB,	/* Calls BPF program when an
+						 * active connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,	/* Calls BPF program when a
+						 * passive connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_NEEDS_ECN,		/* If connection's congestion control
+					 * needs ECN
+					 */
+};
+
+#define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
+
 #endif /* __LINUX_BPF_H__ */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index e230af2e..e439565d 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -42,6 +42,7 @@
 #define MSDOS_SUPER_MAGIC	0x4d44		/* MD */
 #define NCP_SUPER_MAGIC		0x564c		/* Guess, what 0x564c is :-) */
 #define NFS_SUPER_MAGIC		0x6969
+#define OCFS2_SUPER_MAGIC	0x7461636f
 #define OPENPROM_SUPER_MAGIC	0x9fa1
 #define QNX4_SUPER_MAGIC	0x002f		/* qnx4 fs detection */
 #define QNX6_SUPER_MAGIC	0x68191122	/* qnx6 fs detection */
@@ -80,6 +81,8 @@
 #define BTRFS_TEST_MAGIC	0x73727279
 #define NSFS_MAGIC		0x6e736673
 #define BPF_FS_MAGIC		0xcafe4a11
+#define AAFS_MAGIC		0x5a3c69f0
+
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC		0x15013346
 #define BALLOON_KVM_MAGIC	0x13661366
diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 4a169feb..fec24c41 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -121,6 +121,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_RESET_STREAMS	119
 #define SCTP_RESET_ASSOC	120
 #define SCTP_ADD_STREAMS	121
+#define SCTP_SOCKOPT_PEELOFF_FLAGS 122
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -972,6 +973,11 @@ typedef struct {
 	int sd;
 } sctp_peeloff_arg_t;
 
+typedef struct {
+	sctp_peeloff_arg_t p_arg;
+	unsigned flags;
+} sctp_peeloff_flags_arg_t;
+
 /*
  *  Peer Address Thresholds socket option
  */

From 89ec74a3eae28c1a48c81de47cf43527b914cbc4 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 12 Jul 2017 08:27:09 -0700
Subject: [PATCH 2/7] remove duplicated #include's

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/ipaddress.c       | 1 -
 ip/iplink.c          | 1 -
 lib/ll_addr.c        | 1 -
 lib/ll_proto.c       | 1 -
 lib/ll_types.c       | 1 -
 netem/paretonormal.c | 1 -
 6 files changed, 6 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index f06f5829..cf8ef818 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -18,7 +18,6 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <sys/ioctl.h>
 #include <sys/param.h>
 #include <errno.h>
 #include <netinet/in.h>
diff --git a/ip/iplink.c b/ip/iplink.c
index 9674cb65..5aff2fde 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -26,7 +26,6 @@
 #include <arpa/inet.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <linux/sockios.h>
 #include <stdbool.h>
 #include <linux/mpls.h>
 
diff --git a/lib/ll_addr.c b/lib/ll_addr.c
index 465ed6fa..5b5caf3d 100644
--- a/lib/ll_addr.c
+++ b/lib/ll_addr.c
@@ -16,7 +16,6 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <sys/ioctl.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <string.h>
diff --git a/lib/ll_proto.c b/lib/ll_proto.c
index e094d9f8..ef5a5b7b 100644
--- a/lib/ll_proto.c
+++ b/lib/ll_proto.c
@@ -16,7 +16,6 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <sys/ioctl.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <string.h>
diff --git a/lib/ll_types.c b/lib/ll_types.c
index eca617f3..8f294515 100644
--- a/lib/ll_types.c
+++ b/lib/ll_types.c
@@ -16,7 +16,6 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
-#include <sys/ioctl.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <string.h>
diff --git a/netem/paretonormal.c b/netem/paretonormal.c
index 83ec87d4..9773e370 100644
--- a/netem/paretonormal.c
+++ b/netem/paretonormal.c
@@ -11,7 +11,6 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <limits.h>

From 2e86ed542d0d89e57115ba2115303f327652f3a7 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 7 Jul 2017 15:08:11 -0700
Subject: [PATCH 3/7] iproute: extend route get for mpls routes

This patch extends route get to support mpls specific
route attributes like RTA_NEWDST.

Input:
RTA_DST - input label
RTA_NEWDST - labels in packet for multipath selection

By default the getroute handler returns the matched
nexthop label, via and oif.

With the fibmatch keyword (RTM_F_FIB_MATCH flag), the full
matched route is returned.

example:
$ip -f mpls route show
101
        nexthop as to 102/103 via inet 172.16.2.2 dev virt1-2
        nexthop as to 302/303 via inet 172.16.12.2 dev virt1-12
201
        nexthop as to 202/203 via inet6 2001:db8:2::2 dev virt1-2
        nexthop as to 402/403 via inet6 2001:db8:12::2 dev virt1-12

$ip -f mpls route get 103
RTNETLINK answers: Network is unreachable

$ip -f mpls route get 101
101 as to 102/103 via inet 172.16.2.2 dev virt1-2

$ip -f mpls route get as to 302/303 101
101 as to 302/303 via inet 172.16.12.2 dev virt1-12

$ip -f mpls route get fibmatch 103
RTNETLINK answers: Network is unreachable

$ip -f mpls route get fibmatch 101
101
        nexthop as to 102/103 via inet 172.16.2.2 dev virt1-2
        nexthop as to 302/303 via inet 172.16.12.2 dev virt1-12

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 ip/iproute.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/ip/iproute.c b/ip/iproute.c
index 4e022d77..a735d281 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1731,6 +1731,16 @@ static int iproute_get(int argc, char **argv)
 			addattr32(&req.n, sizeof(req), RTA_UID, uid);
 		} else if (matches(*argv, "fibmatch") == 0) {
 			fib_match = 1;
+		} else if (strcmp(*argv, "as") == 0) {
+			inet_prefix addr;
+
+			NEXT_ARG();
+			if (strcmp(*argv, "to") == 0)
+				NEXT_ARG();
+			get_addr(&addr, *argv, req.r.rtm_family);
+			if (req.r.rtm_family == AF_UNSPEC)
+				req.r.rtm_family = addr.family;
+			addattr_l(&req.n, sizeof(req), RTA_NEWDST, &addr.data, addr.bytelen);
 		} else {
 			inet_prefix addr;
 

From 23b2ed2d647cf2bf9a1542e73de4743b8340f123 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Jul 2017 17:18:50 +0200
Subject: [PATCH 4/7] bpf: remove obsolete samples

Remove old samples that were added in pre-BPF fs days and relied on
file descriptor passing. That method is long obsolete and its use is
discouraged, given that BPF fs is the default way, as in the other
samples.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 examples/bpf/bpf_agent.c  | 258 --------------------
 examples/bpf/bpf_prog.c   | 501 --------------------------------------
 examples/bpf/bpf_shared.h |  22 --
 examples/bpf/bpf_sys.h    |  23 --
 4 files changed, 804 deletions(-)
 delete mode 100644 examples/bpf/bpf_agent.c
 delete mode 100644 examples/bpf/bpf_prog.c
 delete mode 100644 examples/bpf/bpf_shared.h
 delete mode 100644 examples/bpf/bpf_sys.h

diff --git a/examples/bpf/bpf_agent.c b/examples/bpf/bpf_agent.c
deleted file mode 100644
index f9b9ce3c..00000000
--- a/examples/bpf/bpf_agent.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * eBPF user space agent part
- *
- * Simple, _self-contained_ user space agent for the eBPF kernel
- * ebpf_prog.c program, which gets all map fds passed from tc via unix
- * domain socket in one transaction and can thus keep referencing
- * them from user space in order to read out (or possibly modify)
- * map data. Here, just as a minimal example to display counters.
- *
- * The agent only uses the bpf(2) syscall API to read or possibly
- * write to eBPF maps, it doesn't need to be aware of the low-level
- * bytecode parts and/or ELF parsing bits.
- *
- * ! For more details, see header comment in bpf_prog.c !
- *
- * gcc bpf_agent.c -o bpf_agent -Wall -O2
- *
- * For example, a more complex user space agent could run on each
- * host, reading and writing into eBPF maps used by tc classifier
- * and actions. It would thus allow for implementing a distributed
- * tc architecture, for example, which would push down central
- * policies into eBPF maps, and thus altering run-time behaviour.
- *
- *   -- Happy eBPF hacking! ;)
- */
-
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <assert.h>
-
-#include <sys/un.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-
-/* Just some misc macros as min(), offsetof(), etc. */
-#include "../../include/utils.h"
-/* Common code from fd passing. */
-#include "../../include/bpf_scm.h"
-/* Common, shared definitions with ebpf_prog.c */
-#include "bpf_shared.h"
-/* Mini syscall wrapper */
-#include "bpf_sys.h"
-
-static void bpf_dump_drops(int fd)
-{
-	int cpu, max;
-
-	max = sysconf(_SC_NPROCESSORS_ONLN);
-
-	printf(" `- number of drops:");
-	for (cpu = 0; cpu < max; cpu++) {
-		long drops;
-
-		assert(bpf_lookup_elem(fd, &cpu, &drops) == 0);
-		printf("\tcpu%d: %5ld", cpu, drops);
-	}
-	printf("\n");
-}
-
-static void bpf_dump_queue(int fd)
-{
-	/* Just for the same of the example. */
-	int max_queue = 4, i;
-
-	printf("  | nic queues:");
-	for (i = 0; i < max_queue; i++) {
-		struct count_queue cq;
-		int ret;
-
-		memset(&cq, 0, sizeof(cq));
-		ret = bpf_lookup_elem(fd, &i, &cq);
-		assert(ret == 0 || (ret < 0 && errno == ENOENT));
-
-		printf("\tq%d:[pkts: %ld, mis: %ld]",
-		       i, cq.total, cq.mismatch);
-	}
-	printf("\n");
-}
-
-static void bpf_dump_proto(int fd)
-{
-	uint8_t protos[] = { IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP };
-	char *names[] = { "tcp", "udp", "icmp" };
-	int i;
-
-	printf("  ` protos:");
-	for (i = 0; i < ARRAY_SIZE(protos); i++) {
-		struct count_tuple ct;
-		int ret;
-
-		memset(&ct, 0, sizeof(ct));
-		ret = bpf_lookup_elem(fd, &protos[i], &ct);
-		assert(ret == 0 || (ret < 0 && errno == ENOENT));
-
-		printf("\t%s:[pkts: %ld, bytes: %ld]",
-		       names[i], ct.packets, ct.bytes);
-	}
-	printf("\n");
-}
-
-static void bpf_dump_map_data(int *tfd)
-{
-	int i;
-
-	for (i = 0; i < 30; i++) {
-		const int period = 5;
-
-		printf("data, period: %dsec\n", period);
-
-		bpf_dump_drops(tfd[BPF_MAP_ID_DROPS]);
-		bpf_dump_queue(tfd[BPF_MAP_ID_QUEUE]);
-		bpf_dump_proto(tfd[BPF_MAP_ID_PROTO]);
-
-		sleep(period);
-	}
-}
-
-static void bpf_info_loop(int *fds, struct bpf_map_aux *aux)
-{
-	int i, tfd[BPF_MAP_ID_MAX];
-
-	printf("ver: %d\nobj: %s\ndev: %lu\nino: %lu\nmaps: %u\n",
-	       aux->uds_ver, aux->obj_name, aux->obj_st.st_dev,
-	       aux->obj_st.st_ino, aux->num_ent);
-
-	for (i = 0; i < aux->num_ent; i++) {
-		printf("map%d:\n", i);
-		printf(" `- fd: %u\n", fds[i]);
-		printf("  | serial: %u\n", aux->ent[i].id);
-		printf("  | type: %u\n", aux->ent[i].type);
-		printf("  | max elem: %u\n", aux->ent[i].max_elem);
-		printf("  | size key: %u\n", aux->ent[i].size_key);
-		printf("  ` size val: %u\n", aux->ent[i].size_value);
-
-		tfd[aux->ent[i].id] = fds[i];
-	}
-
-	bpf_dump_map_data(tfd);
-}
-
-static void bpf_map_get_from_env(int *tfd)
-{
-	char key[64], *val;
-	int i;
-
-	for (i = 0; i < BPF_MAP_ID_MAX; i++) {
-		memset(key, 0, sizeof(key));
-		snprintf(key, sizeof(key), "BPF_MAP%d", i);
-
-		val = getenv(key);
-		assert(val != NULL);
-
-		tfd[i] = atoi(val);
-	}
-}
-
-static int bpf_map_set_recv(int fd, int *fds,  struct bpf_map_aux *aux,
-			    unsigned int entries)
-{
-	struct bpf_map_set_msg msg;
-	int *cmsg_buf, min_fd, i;
-	char *amsg_buf, *mmsg_buf;
-
-	cmsg_buf = bpf_map_set_init(&msg, NULL, 0);
-	amsg_buf = (char *)msg.aux.ent;
-	mmsg_buf = (char *)&msg.aux;
-
-	for (i = 0; i < entries; i += min_fd) {
-		struct cmsghdr *cmsg;
-		int ret;
-
-		min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i);
-
-		bpf_map_set_init_single(&msg, min_fd);
-
-		ret = recvmsg(fd, &msg.hdr, 0);
-		if (ret <= 0)
-			return ret ? : -1;
-
-		cmsg = CMSG_FIRSTHDR(&msg.hdr);
-		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
-			return -EINVAL;
-		if (msg.hdr.msg_flags & MSG_CTRUNC)
-			return -EIO;
-
-		min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd);
-		if (min_fd > entries || min_fd <= 0)
-			return -1;
-
-		memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd);
-		memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd);
-		memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent));
-
-		if (i + min_fd == aux->num_ent)
-			break;
-	}
-
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	int fds[BPF_SCM_MAX_FDS];
-	struct bpf_map_aux aux;
-	struct sockaddr_un addr;
-	int fd, ret, i;
-
-	/* When arguments are being passed, we take it as a path
-	 * to a Unix domain socket, otherwise we grab the fds
-	 * from the environment to demonstrate both possibilities.
-	 */
-	if (argc == 1) {
-		int tfd[BPF_MAP_ID_MAX];
-
-		bpf_map_get_from_env(tfd);
-		bpf_dump_map_data(tfd);
-
-		return 0;
-	}
-
-	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
-	if (fd < 0) {
-		fprintf(stderr, "Cannot open socket: %s\n",
-			strerror(errno));
-		exit(1);
-	}
-
-	memset(&addr, 0, sizeof(addr));
-	addr.sun_family = AF_UNIX;
-	strncpy(addr.sun_path, argv[argc - 1], sizeof(addr.sun_path));
-
-	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
-	if (ret < 0) {
-		fprintf(stderr, "Cannot bind to socket: %s\n",
-			strerror(errno));
-		exit(1);
-	}
-
-	memset(fds, 0, sizeof(fds));
-	memset(&aux, 0, sizeof(aux));
-
-	ret = bpf_map_set_recv(fd, fds, &aux, BPF_SCM_MAX_FDS);
-	if (ret >= 0)
-		bpf_info_loop(fds, &aux);
-
-	for (i = 0; i < aux.num_ent; i++)
-		close(fds[i]);
-
-	close(fd);
-	return 0;
-}
diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c
deleted file mode 100644
index d6caf374..00000000
--- a/examples/bpf/bpf_prog.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * eBPF kernel space program part
- *
- * Toy eBPF program for demonstration purposes, some parts derived from
- * kernel tree's samples/bpf/sockex2_kern.c example.
- *
- * More background on eBPF, kernel tree: Documentation/networking/filter.txt
- *
- * Note, this file is rather large, and most classifier and actions are
- * likely smaller to accomplish one specific use-case and are tailored
- * for high performance. For performance reasons, you might also have the
- * classifier and action already merged inside the classifier.
- *
- * In order to show various features it serves as a bigger programming
- * example, which you should feel free to rip apart and experiment with.
- *
- * Compilation, configuration example:
- *
- *  Note: as long as the BPF backend in LLVM is still experimental,
- *  you need to build LLVM with LLVM with --enable-experimental-targets=BPF
- *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
- *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
- *
- *  In case you need to sync kernel headers, go to your kernel source tree:
- *  # make headers_install INSTALL_HDR_PATH=/usr/
- *
- *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
- *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
- *  $ objdump -h bpf.o
- *  [...]
- *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
- *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
- *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
- *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
- *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
- *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
- *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
- *                  CONTENTS, ALLOC, LOAD, DATA
- *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
- *                  CONTENTS, ALLOC, LOAD, DATA
- *  [...]
- *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
- *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
- *  # ./bpf_agent /tmp/bpf-uds      (e.g. on a different terminal)
- *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
- *                             action bpf obj bpf.o sec action-mark            \
- *                             action bpf obj bpf.o sec action-rand ok
- *  # tc filter show dev em1
- *  filter parent 1: protocol all pref 49152 bpf
- *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
- *    action order 1: bpf bpf.o:[action-mark] default-action pipe
- *    index 52 ref 1 bind 1
- *
- *    action order 2: bpf bpf.o:[action-rand] default-action pipe
- *    index 53 ref 1 bind 1
- *
- *    action order 3: gact action pass
- *    random type none pass val 0
- *    index 38 ref 1 bind 1
- *
- * The same program can also be installed on ingress side (as opposed to above
- * egress configuration), e.g.:
- *
- * # tc qdisc add dev em1 handle ffff: ingress
- * # tc filter add dev em1 parent ffff: bpf obj ...
- *
- * Notes on BPF agent:
- *
- * In the above example, the bpf_agent creates the unix domain socket
- * natively. "tc exec" can also spawn a shell and hold the socktes there:
- *
- *  # tc exec bpf imp /tmp/bpf-uds
- *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
- *                             action bpf obj bpf.o sec action-mark            \
- *                             action bpf obj bpf.o sec action-rand ok
- *  sh-4.2# (shell spawned from tc exec)
- *  sh-4.2# bpf_agent
- *  [...]
- *
- * This will read out fds over environment and produce the same data dump
- * as below. This has the advantage that the spawned shell owns the fds
- * and thus if the agent is restarted, it can reattach to the same fds, also
- * various programs can easily read/modify the data simultaneously from user
- * space side.
- *
- * If the shell is unnecessary, the agent can also just be spawned directly
- * via tc exec:
- *
- *  # tc exec bpf imp /tmp/bpf-uds run bpf_agent
- *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
- *                             action bpf obj bpf.o sec action-mark            \
- *                             action bpf obj bpf.o sec action-rand ok
- *
- * BPF agent example output:
- *
- * ver: 1
- * obj: bpf.o
- * dev: 64770
- * ino: 6045133
- * maps: 3
- * map0:
- *  `- fd: 4
- *   | serial: 1
- *   | type: 1
- *   | max elem: 256
- *   | size key: 1
- *   ` size val: 16
- * map1:
- *  `- fd: 5
- *   | serial: 2
- *   | type: 1
- *   | max elem: 1024
- *   | size key: 4
- *   ` size val: 16
- * map2:
- *  `- fd: 6
- *   | serial: 3
- *   | type: 2
- *   | max elem: 64
- *   | size key: 4
- *   ` size val: 8
- * data, period: 5sec
- *  `- number of drops:	cpu0:     0	cpu1:     0	cpu2:     0	cpu3:     0
- *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 0, mis: 0]	q3:[pkts: 0, mis: 0]
- *   ` protos:	tcp:[pkts: 0, bytes: 0]	udp:[pkts: 0, bytes: 0]	icmp:[pkts: 0, bytes: 0]
- * data, period: 5sec
- *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     0	cpu3:     1
- *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 24, mis: 14]	q3:[pkts: 0, mis: 0]
- *   ` protos:	tcp:[pkts: 13, bytes: 1989]	udp:[pkts: 10, bytes: 710]	icmp:[pkts: 0, bytes: 0]
- * data, period: 5sec
- *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     3	cpu3:     3
- *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 39, mis: 21]	q3:[pkts: 0, mis: 0]
- *   ` protos:	tcp:[pkts: 20, bytes: 3549]	udp:[pkts: 18, bytes: 1278]	icmp:[pkts: 0, bytes: 0]
- * [...]
- *
- * This now means, the below classifier and action pipeline has been loaded
- * as eBPF bytecode into the kernel, the kernel has verified that the
- * execution of the bytecode is "safe", and it has JITed the programs
- * afterwards, so that upon invocation they're running on native speed. tc
- * has transferred all map file descriptors to the bpf_agent via IPC and
- * even after tc exits, the agent can read out or modify all map data.
- *
- * Note that the export to the uds is done only once in the classifier and
- * not in the action. It's enough to export the (here) shared descriptors
- * once.
- *
- * If you need to disassemble the generated JIT image (echo with 2), the
- * kernel tree has under tools/net/ a small helper, you can invoke e.g.
- * `bpf_jit_disasm -o`.
- *
- * Please find in the code below further comments.
- *
- *   -- Happy eBPF hacking! ;)
- */
-#include <stdint.h>
-#include <stdbool.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <asm/types.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/if_ether.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/if_tunnel.h>
-#include <linux/filter.h>
-#include <linux/bpf.h>
-
-/* Common, shared definitions with ebpf_agent.c. */
-#include "bpf_shared.h"
-/* BPF helper functions for our example. */
-#include "../../include/bpf_api.h"
-
-/* Could be defined here as well, or included from the header. */
-#define TC_ACT_UNSPEC		(-1)
-#define TC_ACT_OK		0
-#define TC_ACT_RECLASSIFY	1
-#define TC_ACT_SHOT		2
-#define TC_ACT_PIPE		3
-#define TC_ACT_STOLEN		4
-#define TC_ACT_QUEUED		5
-#define TC_ACT_REPEAT		6
-
-/* Other, misc stuff. */
-#define IP_MF			0x2000
-#define IP_OFFSET		0x1FFF
-
-/* eBPF map definitions, all placed in section "maps". */
-struct bpf_elf_map __section("maps") map_proto = {
-	.type		=	BPF_MAP_TYPE_HASH,
-	.id		=	BPF_MAP_ID_PROTO,
-	.size_key	=	sizeof(uint8_t),
-	.size_value	=	sizeof(struct count_tuple),
-	.max_elem	=	256,
-	.flags		=	BPF_F_NO_PREALLOC,
-};
-
-struct bpf_elf_map __section("maps") map_queue = {
-	.type		=	BPF_MAP_TYPE_HASH,
-	.id		=	BPF_MAP_ID_QUEUE,
-	.size_key	=	sizeof(uint32_t),
-	.size_value	=	sizeof(struct count_queue),
-	.max_elem	=	1024,
-	.flags		=	BPF_F_NO_PREALLOC,
-};
-
-struct bpf_elf_map __section("maps") map_drops = {
-	.type		=	BPF_MAP_TYPE_ARRAY,
-	.id		=	BPF_MAP_ID_DROPS,
-	.size_key	=	sizeof(uint32_t),
-	.size_value	=	sizeof(long),
-	.max_elem	=	64,
-};
-
-/* Helper functions and definitions for the flow dissector used by the
- * example classifier. This resembles the kernel's flow dissector to
- * some extend and is just used as an example to show what's possible
- * with eBPF.
- */
-struct sockaddr;
-
-struct vlan_hdr {
-	__be16 h_vlan_TCI;
-	__be16 h_vlan_encapsulated_proto;
-};
-
-struct flow_keys {
-	__u32 src;
-	__u32 dst;
-	union {
-		__u32 ports;
-		__u16 port16[2];
-	};
-	__s32 th_off;
-	__u8 ip_proto;
-};
-
-static __inline__ int flow_ports_offset(__u8 ip_proto)
-{
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-	case IPPROTO_SCTP:
-	case IPPROTO_UDPLITE:
-	default:
-		return 0;
-	case IPPROTO_AH:
-		return 4;
-	}
-}
-
-static __inline__ bool flow_is_frag(struct __sk_buff *skb, int nh_off)
-{
-	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
-		  (IP_MF | IP_OFFSET));
-}
-
-static __inline__ int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
-				      __u8 *ip_proto, struct flow_keys *flow)
-{
-	__u8 ip_ver_len;
-
-	if (unlikely(flow_is_frag(skb, nh_off)))
-		*ip_proto = 0;
-	else
-		*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
-							     protocol));
-	if (*ip_proto != IPPROTO_GRE) {
-		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
-		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
-	}
-
-	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
-	if (likely(ip_ver_len == 0x45))
-		nh_off += 20;
-	else
-		nh_off += (ip_ver_len & 0xF) << 2;
-
-	return nh_off;
-}
-
-static __inline__ __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
-{
-	__u32 w0 = load_word(skb, off);
-	__u32 w1 = load_word(skb, off + sizeof(w0));
-	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
-	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);
-
-	return w0 ^ w1 ^ w2 ^ w3;
-}
-
-static __inline__ int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
-				      __u8 *ip_proto, struct flow_keys *flow)
-{
-	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));
-
-	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
-	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));
-
-	return nh_off + sizeof(struct ipv6hdr);
-}
-
-static __inline__ bool flow_dissector(struct __sk_buff *skb,
-				      struct flow_keys *flow)
-{
-	int poff, nh_off = BPF_LL_OFF + ETH_HLEN;
-	__be16 proto = skb->protocol;
-	__u8 ip_proto;
-
-	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
-	if (proto == htons(ETH_P_8021AD)) {
-		proto = load_half(skb, nh_off +
-				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
-		nh_off += sizeof(struct vlan_hdr);
-	}
-	if (proto == htons(ETH_P_8021Q)) {
-		proto = load_half(skb, nh_off +
-				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
-		nh_off += sizeof(struct vlan_hdr);
-	}
-
-	if (likely(proto == htons(ETH_P_IP)))
-		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
-	else if (proto == htons(ETH_P_IPV6))
-		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
-	else
-		return false;
-
-	switch (ip_proto) {
-	case IPPROTO_GRE: {
-		struct gre_hdr {
-			__be16 flags;
-			__be16 proto;
-		};
-
-		__u16 gre_flags = load_half(skb, nh_off +
-					    offsetof(struct gre_hdr, flags));
-		__u16 gre_proto = load_half(skb, nh_off +
-					    offsetof(struct gre_hdr, proto));
-
-		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
-			break;
-
-		nh_off += 4;
-		if (gre_flags & GRE_CSUM)
-			nh_off += 4;
-		if (gre_flags & GRE_KEY)
-			nh_off += 4;
-		if (gre_flags & GRE_SEQ)
-			nh_off += 4;
-
-		if (gre_proto == ETH_P_8021Q) {
-			gre_proto = load_half(skb, nh_off +
-					      offsetof(struct vlan_hdr,
-						       h_vlan_encapsulated_proto));
-			nh_off += sizeof(struct vlan_hdr);
-		}
-		if (gre_proto == ETH_P_IP)
-			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
-		else if (gre_proto == ETH_P_IPV6)
-			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
-		else
-			return false;
-		break;
-	}
-	case IPPROTO_IPIP:
-		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
-		break;
-	case IPPROTO_IPV6:
-		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
-	default:
-		break;
-	}
-
-	nh_off += flow_ports_offset(ip_proto);
-
-	flow->ports = load_word(skb, nh_off);
-	flow->th_off = nh_off;
-	flow->ip_proto = ip_proto;
-
-	return true;
-}
-
-static __inline__ void cls_update_proto_map(const struct __sk_buff *skb,
-					    const struct flow_keys *flow)
-{
-	uint8_t proto = flow->ip_proto;
-	struct count_tuple *ct, _ct;
-
-	ct = map_lookup_elem(&map_proto, &proto);
-	if (likely(ct)) {
-		lock_xadd(&ct->packets, 1);
-		lock_xadd(&ct->bytes, skb->len);
-		return;
-	}
-
-	/* No hit yet, we need to create a new entry. */
-	_ct.packets = 1;
-	_ct.bytes = skb->len;
-
-	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
-}
-
-static __inline__ void cls_update_queue_map(const struct __sk_buff *skb)
-{
-	uint32_t queue = skb->queue_mapping;
-	struct count_queue *cq, _cq;
-	bool mismatch;
-
-	mismatch = skb->queue_mapping != get_smp_processor_id();
-
-	cq = map_lookup_elem(&map_queue, &queue);
-	if (likely(cq)) {
-		lock_xadd(&cq->total, 1);
-		if (mismatch)
-			lock_xadd(&cq->mismatch, 1);
-		return;
-	}
-
-	/* No hit yet, we need to create a new entry. */
-	_cq.total = 1;
-	_cq.mismatch = mismatch ? 1 : 0;
-
-	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
-}
-
-/* eBPF program definitions, placed in various sections, which can
- * have custom section names. If custom names are in use, it's
- * required to point tc to the correct section, e.g.
- *
- *     tc filter add [...] bpf obj cls.o sec cls-tos [...]
- *
- * in case the program resides in __section("cls-tos").
- *
- * Default section for cls_bpf is: "classifier", for act_bpf is:
- * "action". Naturally, if for example multiple actions are present
- * in the same file, they need to have distinct section names.
- *
- * It is however not required to have multiple programs sharing
- * a file.
- */
-__section("classifier")
-int cls_main(struct __sk_buff *skb)
-{
-	struct flow_keys flow;
-
-	if (!flow_dissector(skb, &flow))
-		return 0; /* No match in cls_bpf. */
-
-	cls_update_proto_map(skb, &flow);
-	cls_update_queue_map(skb);
-
-	return flow.ip_proto;
-}
-
-static __inline__ void act_update_drop_map(void)
-{
-	uint32_t *count, cpu = get_smp_processor_id();
-
-	count = map_lookup_elem(&map_drops, &cpu);
-	if (count)
-		/* Only this cpu is accessing this element. */
-		(*count)++;
-}
-
-__section("action-mark")
-int act_mark_main(struct __sk_buff *skb)
-{
-	/* You could also mangle skb data here with the helper function
-	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
-	 * do that already in the classifier itself as a merged combination
-	 * of classifier'n'action model.
-	 */
-
-	if (skb->mark == 0xcafe) {
-		act_update_drop_map();
-		return TC_ACT_SHOT;
-	}
-
-	/* Default configured tc opcode. */
-	return TC_ACT_UNSPEC;
-}
-
-__section("action-rand")
-int act_rand_main(struct __sk_buff *skb)
-{
-	/* Sorry, we're near event horizon ... */
-	if ((get_prandom_u32() & 3) == 0) {
-		act_update_drop_map();
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_UNSPEC;
-}
-
-/* Last but not least, the file contains a license. Some future helper
- * functions may only be available with a GPL license.
- */
-BPF_LICENSE("GPL");
diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h
deleted file mode 100644
index a24038dd..00000000
--- a/examples/bpf/bpf_shared.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef __BPF_SHARED__
-#define __BPF_SHARED__
-
-enum {
-	BPF_MAP_ID_PROTO,
-	BPF_MAP_ID_QUEUE,
-	BPF_MAP_ID_DROPS,
-	__BPF_MAP_ID_MAX,
-#define BPF_MAP_ID_MAX	__BPF_MAP_ID_MAX
-};
-
-struct count_tuple {
-	long packets; /* type long for lock_xadd() */
-	long bytes;
-};
-
-struct count_queue {
-	long total;
-	long mismatch;
-};
-
-#endif /* __BPF_SHARED__ */
diff --git a/examples/bpf/bpf_sys.h b/examples/bpf/bpf_sys.h
deleted file mode 100644
index 6e4f09e2..00000000
--- a/examples/bpf/bpf_sys.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __BPF_SYS__
-#define __BPF_SYS__
-
-#include <sys/syscall.h>
-#include <linux/bpf.h>
-
-static inline __u64 bpf_ptr_to_u64(const void *ptr)
-{
-	return (__u64) (unsigned long) ptr;
-}
-
-static inline int bpf_lookup_elem(int fd, void *key, void *value)
-{
-	union bpf_attr attr = {
-		.map_fd		= fd,
-		.key		= bpf_ptr_to_u64(key),
-		.value		= bpf_ptr_to_u64(value),
-	};
-
-	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
-}
-
-#endif /* __BPF_SYS__ */

From 612ff099a11092a6ed4084b84a8810155884baca Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Jul 2017 17:18:51 +0200
Subject: [PATCH 5/7] bpf: support loading map in map from obj

Add support for map in map in the loader and add a small example program.
The outer map uses inner_id to reference a bpf_elf_map with a given ID
as the inner type. Loading maps is done in three passes, i) all non-map
in map maps are loaded, ii) all map in map maps are loaded based on the
inner_id map spec of a non-map in map with corresponding id, and iii)
related inner maps are attached to the map in map with given inner_idx
key. Pinned objects are assumed to be managed externally, so they are
only retrieved from BPF fs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 examples/bpf/bpf_map_in_map.c |  56 ++++++++++++
 include/bpf_elf.h             |   2 +
 lib/bpf.c                     | 159 +++++++++++++++++++++++++++++++---
 3 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100644 examples/bpf/bpf_map_in_map.c

diff --git a/examples/bpf/bpf_map_in_map.c b/examples/bpf/bpf_map_in_map.c
new file mode 100644
index 00000000..ff0e623a
--- /dev/null
+++ b/examples/bpf/bpf_map_in_map.c
@@ -0,0 +1,56 @@
+#include "../../include/bpf_api.h"
+
+#define MAP_INNER_ID	42
+
+struct bpf_elf_map __section_maps map_inner = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.size_key	= sizeof(uint32_t),
+	.size_value	= sizeof(uint32_t),
+	.id		= MAP_INNER_ID,
+	.inner_idx	= 0,
+	.pinning	= PIN_GLOBAL_NS,
+	.max_elem	= 1,
+};
+
+struct bpf_elf_map __section_maps map_outer = {
+	.type		= BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	.size_key	= sizeof(uint32_t),
+	.size_value	= sizeof(uint32_t),
+	.inner_id	= MAP_INNER_ID,
+	.pinning	= PIN_GLOBAL_NS,
+	.max_elem	= 1,
+};
+
+__section("egress")
+int emain(struct __sk_buff *skb)
+{
+	struct bpf_elf_map *map_inner;
+	int key = 0, *val;
+
+	map_inner = map_lookup_elem(&map_outer, &key);
+	if (map_inner) {
+		val = map_lookup_elem(map_inner, &key);
+		if (val)
+			lock_xadd(val, 1);
+	}
+
+	return BPF_H_DEFAULT;
+}
+
+__section("ingress")
+int imain(struct __sk_buff *skb)
+{
+	struct bpf_elf_map *map_inner;
+	int key = 0, *val;
+
+	map_inner = map_lookup_elem(&map_outer, &key);
+	if (map_inner) {
+		val = map_lookup_elem(map_inner, &key);
+		if (val)
+			printt("map val: %d\n", *val);
+	}
+
+	return BPF_H_DEFAULT;
+}
+
+BPF_LICENSE("GPL");
diff --git a/include/bpf_elf.h b/include/bpf_elf.h
index 239a0f36..406c3087 100644
--- a/include/bpf_elf.h
+++ b/include/bpf_elf.h
@@ -36,6 +36,8 @@ struct bpf_elf_map {
 	__u32 flags;
 	__u32 id;
 	__u32 pinning;
+	__u32 inner_id;
+	__u32 inner_idx;
 };
 
 #endif /* __BPF_ELF__ */
diff --git a/lib/bpf.c b/lib/bpf.c
index 6b5a96d0..45747d23 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -1023,15 +1023,16 @@ static int bpf_log_realloc(struct bpf_elf_ctx *ctx)
 
 static int bpf_map_create(enum bpf_map_type type, uint32_t size_key,
 			  uint32_t size_value, uint32_t max_elem,
-			  uint32_t flags)
+			  uint32_t flags, int inner_fd)
 {
 	union bpf_attr attr = {};
 
 	attr.map_type = type;
 	attr.key_size = size_key;
-	attr.value_size = size_value;
+	attr.value_size = inner_fd ? sizeof(int) : size_value;
 	attr.max_entries = max_elem;
 	attr.map_flags = flags;
+	attr.inner_map_fd = inner_fd;
 
 	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
@@ -1343,7 +1344,7 @@ retry:
 
 static void bpf_map_report(int fd, const char *name,
 			   const struct bpf_elf_map *map,
-			   struct bpf_elf_ctx *ctx)
+			   struct bpf_elf_ctx *ctx, int inner_fd)
 {
 	fprintf(stderr, "Map object \'%s\' %s%s (%d)!\n", name,
 		fd < 0 ? "rejected: " : "loaded",
@@ -1354,15 +1355,91 @@ static void bpf_map_report(int fd, const char *name,
 	fprintf(stderr, " - Identifier:   %u\n", map->id);
 	fprintf(stderr, " - Pinning:      %u\n", map->pinning);
 	fprintf(stderr, " - Size key:     %u\n", map->size_key);
-	fprintf(stderr, " - Size value:   %u\n", map->size_value);
+	fprintf(stderr, " - Size value:   %u\n",
+		inner_fd ? (int)sizeof(int) : map->size_value);
 	fprintf(stderr, " - Max elems:    %u\n", map->max_elem);
 	fprintf(stderr, " - Flags:        %#x\n\n", map->flags);
 }
 
-static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
-			  struct bpf_elf_ctx *ctx)
+static int bpf_find_map_id(const struct bpf_elf_ctx *ctx, uint32_t id)
 {
-	int fd, ret;
+	int i;
+
+	for (i = 0; i < ctx->map_num; i++) {
+		if (ctx->maps[i].id != id)
+			continue;
+		if (ctx->map_fds[i] < 0)
+			return -EINVAL;
+
+		return ctx->map_fds[i];
+	}
+
+	return -ENOENT;
+}
+
+static int bpf_derive_elf_map_from_fdinfo(int fd, struct bpf_elf_map *map)
+{
+	char file[PATH_MAX], buff[4096];
+	unsigned int val;
+	FILE *fp;
+
+	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
+
+	memset(map, 0, sizeof(*map));
+
+	fp = fopen(file, "r");
+	if (!fp) {
+		fprintf(stderr, "No procfs support?!\n");
+		return -EIO;
+	}
+
+	while (fgets(buff, sizeof(buff), fp)) {
+		if (sscanf(buff, "map_type:\t%u", &val) == 1)
+			map->type = val;
+		else if (sscanf(buff, "key_size:\t%u", &val) == 1)
+			map->size_key = val;
+		else if (sscanf(buff, "value_size:\t%u", &val) == 1)
+			map->size_value = val;
+		else if (sscanf(buff, "max_entries:\t%u", &val) == 1)
+			map->max_elem = val;
+		else if (sscanf(buff, "map_flags:\t%i", &val) == 1)
+			map->flags = val;
+	}
+
+	fclose(fp);
+	return 0;
+}
+
+static void bpf_report_map_in_map(int outer_fd, int inner_fd, uint32_t idx)
+{
+	struct bpf_elf_map outer_map;
+	int ret;
+
+	fprintf(stderr, "Cannot insert map into map! ");
+
+	ret = bpf_derive_elf_map_from_fdinfo(outer_fd, &outer_map);
+	if (!ret) {
+		if (idx >= outer_map.max_elem &&
+		    outer_map.type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
+			fprintf(stderr, "Outer map has %u elements, index %u is invalid!\n",
+				outer_map.max_elem, idx);
+			return;
+		}
+	}
+
+	fprintf(stderr, "Different map specs used for outer and inner map?\n");
+}
+
+static bool bpf_is_map_in_map_type(const struct bpf_elf_map *map)
+{
+	return map->type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+	       map->type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
+			  struct bpf_elf_ctx *ctx, int *have_map_in_map)
+{
+	int fd, ret, map_inner_fd = 0;
 
 	fd = bpf_probe_pinned(name, ctx, map->pinning);
 	if (fd > 0) {
@@ -1381,11 +1458,29 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map,
 		return fd;
 	}
 
+	if (have_map_in_map && bpf_is_map_in_map_type(map)) {
+		(*have_map_in_map)++;
+		if (map->inner_id)
+			return 0;
+		fprintf(stderr, "Map \'%s\' cannot be created since no inner map ID defined!\n",
+			name);
+		return -EINVAL;
+	}
+
+	if (!have_map_in_map && bpf_is_map_in_map_type(map)) {
+		map_inner_fd = bpf_find_map_id(ctx, map->inner_id);
+		if (map_inner_fd < 0) {
+			fprintf(stderr, "Map \'%s\' cannot be loaded. Inner map with ID %u not found!\n",
+				name, map->inner_id);
+			return -EINVAL;
+		}
+	}
+
 	errno = 0;
 	fd = bpf_map_create(map->type, map->size_key, map->size_value,
-			    map->max_elem, map->flags);
+			    map->max_elem, map->flags, map_inner_fd);
 	if (fd < 0 || ctx->verbose) {
-		bpf_map_report(fd, name, map, ctx);
+		bpf_map_report(fd, name, map, ctx, map_inner_fd);
 		if (fd < 0)
 			return fd;
 	}
@@ -1430,21 +1525,63 @@ static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which)
 
 static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx)
 {
+	int i, j, ret, fd, inner_fd, inner_idx, have_map_in_map = 0;
 	const char *map_name;
-	int i, fd;
 
 	for (i = 0; i < ctx->map_num; i++) {
 		map_name = bpf_map_fetch_name(ctx, i);
 		if (!map_name)
 			return -EIO;
 
-		fd = bpf_map_attach(map_name, &ctx->maps[i], ctx);
+		fd = bpf_map_attach(map_name, &ctx->maps[i], ctx,
+				    &have_map_in_map);
+		if (fd < 0)
+			return fd;
+
+		ctx->map_fds[i] = !fd ? -1 : fd;
+	}
+
+	for (i = 0; have_map_in_map && i < ctx->map_num; i++) {
+		if (ctx->map_fds[i] >= 0)
+			continue;
+
+		map_name = bpf_map_fetch_name(ctx, i);
+		if (!map_name)
+			return -EIO;
+
+		fd = bpf_map_attach(map_name, &ctx->maps[i], ctx,
+				    NULL);
 		if (fd < 0)
 			return fd;
 
 		ctx->map_fds[i] = fd;
 	}
 
+	for (i = 0; have_map_in_map && i < ctx->map_num; i++) {
+		if (!ctx->maps[i].id ||
+		    ctx->maps[i].inner_id ||
+		    ctx->maps[i].inner_idx == -1)
+			continue;
+
+		inner_fd  = ctx->map_fds[i];
+		inner_idx = ctx->maps[i].inner_idx;
+
+		for (j = 0; j < ctx->map_num; j++) {
+			if (!bpf_is_map_in_map_type(&ctx->maps[j]))
+				continue;
+			if (ctx->maps[j].inner_id != ctx->maps[i].id)
+				continue;
+
+			ret = bpf_map_update(ctx->map_fds[j], &inner_idx,
+					     &inner_fd, BPF_ANY);
+			if (ret < 0) {
+				bpf_report_map_in_map(ctx->map_fds[j],
+						      inner_fd, inner_idx);
+				return ret;
+			}
+		}
+	}
+
 	return 0;
 }
 

From 779525cd77b8820d5f6d1b1f92d8d3d0038f858c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Jul 2017 17:18:52 +0200
Subject: [PATCH 6/7] bpf: dump id/jited info for cls/act programs

Make use of TCA_BPF_ID/TCA_ACT_BPF_ID that we exposed and print the ID
of the programs loaded and use the new BPF_OBJ_GET_INFO_BY_FD command
for dumping further information about the program, currently whether
the attached program is jited.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/bpf_util.h |  2 ++
 lib/bpf.c          | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 tc/f_bpf.c         |  3 +++
 tc/m_bpf.c         |  3 +++
 4 files changed, 56 insertions(+)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 5361dab1..6582ec8c 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -261,6 +261,8 @@ int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
 int bpf_prog_attach_fd(int prog_fd, int target_fd, enum bpf_attach_type type);
 int bpf_prog_detach_fd(int target_fd, enum bpf_attach_type type);
 
+void bpf_dump_prog_info(FILE *f, uint32_t id);
+
 #ifdef HAVE_ELF
 int bpf_send_map_fds(const char *path, const char *obj);
 int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
diff --git a/lib/bpf.c b/lib/bpf.c
index 45747d23..7eb5cd96 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -152,6 +152,54 @@ static int bpf_map_update(int fd, const void *key, const void *value,
 	return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
 }
 
+static int bpf_prog_fd_by_id(uint32_t id)
+{
+	union bpf_attr attr = {};
+
+	attr.prog_id = id;
+
+	return bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
+}
+
+static int bpf_prog_info_by_fd(int fd, struct bpf_prog_info *info,
+			       uint32_t *info_len)
+{
+	union bpf_attr attr = {};
+	int ret;
+
+	attr.info.bpf_fd = fd;
+	attr.info.info = bpf_ptr_to_u64(info);
+	attr.info.info_len = *info_len;
+
+	*info_len = 0;
+	ret = bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+	if (!ret)
+		*info_len = attr.info.info_len;
+
+	return ret;
+}
+
+void bpf_dump_prog_info(FILE *f, uint32_t id)
+{
+	struct bpf_prog_info info = {};
+	uint32_t len = sizeof(info);
+	int fd, ret;
+
+	fprintf(f, "id %u ", id);
+
+	fd = bpf_prog_fd_by_id(id);
+	if (fd < 0)
+		return;
+
+	ret = bpf_prog_info_by_fd(fd, &info, &len);
+	if (!ret && len) {
+		if (info.jited_prog_len)
+			fprintf(f, "jited ");
+	}
+
+	close(fd);
+}
+
 static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len,
 			    char **bpf_string, bool *need_release,
 			    const char separator)
diff --git a/tc/f_bpf.c b/tc/f_bpf.c
index 75c44c06..2f8d12a6 100644
--- a/tc/f_bpf.c
+++ b/tc/f_bpf.c
@@ -230,6 +230,9 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f,
 				      b, sizeof(b)));
 	}
 
+	if (tb[TCA_BPF_ID])
+		bpf_dump_prog_info(f, rta_getattr_u32(tb[TCA_BPF_ID]));
+
 	if (tb[TCA_BPF_POLICE]) {
 		fprintf(f, "\n");
 		tc_print_police(f, tb[TCA_BPF_POLICE]);
diff --git a/tc/m_bpf.c b/tc/m_bpf.c
index 57283030..df559bcc 100644
--- a/tc/m_bpf.c
+++ b/tc/m_bpf.c
@@ -186,6 +186,9 @@ static int bpf_print_opt(struct action_util *au, FILE *f, struct rtattr *arg)
 				      b, sizeof(b)));
 	}
 
+        if (tb[TCA_ACT_BPF_ID])
+                bpf_dump_prog_info(f, rta_getattr_u32(tb[TCA_ACT_BPF_ID]));
+
 	print_action_control(f, "default-action ", parm->action, "\n");
 	fprintf(f, "\tindex %u ref %d bind %d", parm->index, parm->refcnt,
 		parm->bindcnt);

From b09515553fded944713955815a3f1cc855384abd Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Fri, 7 Jul 2017 15:08:33 +0200
Subject: [PATCH 7/7] tc: fix typo in manpage

Fix a typo in the 'tc' manpage and reword some sentences.

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 man/man8/tc-csum.8 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc-csum.8 b/man/man8/tc-csum.8
index 718301de..409ab717 100644
--- a/man/man8/tc-csum.8
+++ b/man/man8/tc-csum.8
@@ -29,9 +29,9 @@ csum - checksum update action
 The
 .B csum
 action triggers checksum recalculation of specified packet headers. It is
-commonly used after packet editing using the
+commonly used to fix incorrect checksums after the
 .B pedit
-action to fix for then incorrect checksums.
+action has modified the packet content.
 .SH OPTIONS
 .TP
 .I TARGET