From b133392468d1f404077a8f3554d1f63d48bb45e8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 20 Jun 2018 10:24:21 +0300 Subject: [PATCH 01/13] tc: fix batch force option When sending accumulated compound command results in an error, check 'force' option before exiting. Move return code check after putting batch bufs and freeing iovs to prevent memory leak. Break from loop, instead of returning error code to allow cleanup at the end of batch function. Don't reset ret code on each iteration. Fixes: 485d0c6001c4 ("tc: Add batchsize feature for filter and actions") Reviewed-by: Roi Dayan Reviewed-by: Chris Mi Signed-off-by: Vlad Buslov Signed-off-by: Stephen Hemminger --- tc/tc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tc/tc.c b/tc/tc.c index 0d223281..62d54186 100644 --- a/tc/tc.c +++ b/tc/tc.c @@ -331,6 +331,7 @@ static int batch(const char *name) int batchsize = 0; size_t len = 0; int ret = 0; + int err; bool send; batch_mode = 1; @@ -399,9 +400,9 @@ static int batch(const char *name) continue; /* blank line */ } - ret = do_cmd(largc, largv, tail == NULL ? NULL : tail->buf, + err = do_cmd(largc, largv, tail == NULL ? NULL : tail->buf, tail == NULL ? 
0 : sizeof(tail->buf)); - if (ret != 0) { + if (err != 0) { fprintf(stderr, "Command failed %s:%d\n", name, cmdlineno - 1); ret = 1; @@ -423,15 +424,17 @@ static int batch(const char *name) iov->iov_len = n->nlmsg_len; } - ret = rtnl_talk_iov(&rth, iovs, batchsize, NULL); - if (ret < 0) { - fprintf(stderr, "Command failed %s:%d\n", name, - cmdlineno - (batchsize + ret) - 1); - return 2; - } + err = rtnl_talk_iov(&rth, iovs, batchsize, NULL); put_batch_bufs(&buf_pool, &head, &tail); - batchsize = 0; free(iovs); + if (err < 0) { + fprintf(stderr, "Command failed %s:%d\n", name, + cmdlineno - (batchsize + err) - 1); + ret = 1; + if (!force) + break; + } + batchsize = 0; } } while (!lastline); From a85f921ae5eb6cccb2a38919f79424bf7f111b6e Mon Sep 17 00:00:00 2001 From: "Eric S. Raymond" Date: Wed, 13 Jun 2018 17:31:12 -0400 Subject: [PATCH 02/13] devlink.8, translate unparseable callout syntax to parseable form. Signed-off-by: Eric S. Raymond Signed-off-by: Stephen Hemminger --- man/man8/devlink.8 | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/man/man8/devlink.8 b/man/man8/devlink.8 index 7986310f..efc6e625 100644 --- a/man/man8/devlink.8 +++ b/man/man8/devlink.8 @@ -7,7 +7,7 @@ devlink \- Devlink tool .in +8 .ti -8 .B devlink -.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | " +.RI "[ " OPTIONS " ] { " dev | port | monitor | sb | resource " } { " COMMAND " | " .BR help " }" .sp @@ -17,18 +17,6 @@ devlink \- Devlink tool .BI "-batch " filename .sp -.ti -8 -.IR OBJECT " := { " -.BR dev " | " port " | " monitor " | " sb " | " resource " }" -.sp - -.ti -8 -.IR OPTIONS " := { " -\fB\-V\fR[\fIersion\fR] | -\fB\-n\fR[\fIno-nice-names\fR] } -\fB\-j\fR[\fIjson\fR] } -\fB\-p\fR[\fIpretty\fR] } - .SH OPTIONS .TP From dc3ef235f388c8d785a9976e379d264ab4e7b23e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 7 Jul 2018 09:56:14 -0700 Subject: [PATCH 03/13] uapi: update bpf.h Signed-off-by: Stephen Hemminger --- include/uapi/linux/bpf.h | 
28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index db4620a9..57e73908 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1857,7 +1857,8 @@ union bpf_attr { * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only). + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -1873,9 +1874,10 @@ union bpf_attr { * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. * Return - * Egress device index on success, 0 if packet needs to continue - * up the stack for further processing or a negative error in case - * of failure. 
+ * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; union { /* inputs to lookup */ From 425dcc2741fa65d338f24de0606eb5c724658455 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 3 Jul 2018 15:54:32 +0300 Subject: [PATCH 04/13] tc: Fix output of ip attributes Example output is of tos and ttl. Before this fix the format used %x, which caused output of the pointer instead of the intended string created in the out variable. 
Fixes: e28b88a464c4 ("tc: jsonify flower filter") Signed-off-by: Roi Dayan Signed-off-by: Stephen Hemminger --- tc/f_flower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/f_flower.c b/tc/f_flower.c index c7107651..1dfd57d2 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -1134,7 +1134,7 @@ static void flower_print_ip_attr(char *name, struct rtattr *key_attr, if (mask_attr) sprintf(out + done, "/%x", rta_getattr_u8(mask_attr)); - sprintf(namefrm, "\n %s %%x", name); + sprintf(namefrm, "\n %s %%s", name); print_string(PRINT_ANY, name, namefrm, out); } From d529ea2ff417eed1d48c580e099388aaace16ce9 Mon Sep 17 00:00:00 2001 From: fumihiko kakuma Date: Wed, 4 Jul 2018 12:32:33 +0900 Subject: [PATCH 05/13] tc: Fix the bug not to display prio and quantum options of htb A commandline like 'tc -d class show dev dev-name' does not display value of prio and quantum option when we use htb qdisc. This patch fixes the bug. Signed-off-by: Fumihiko Kakuma Acked-by: Cong Wang Signed-off-by: Stephen Hemminger --- tc/q_htb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tc/q_htb.c b/tc/q_htb.c index 7d5f6ce4..b93d31d4 100644 --- a/tc/q_htb.c +++ b/tc/q_htb.c @@ -291,9 +291,9 @@ static int htb_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) if (RTA_PAYLOAD(tb[TCA_HTB_PARMS]) < sizeof(*hopt)) return -1; if (!hopt->level) { - print_int(PRINT_ANY, "prio", "prio ", (int)hopt->prio); + print_int(PRINT_ANY, "prio", "prio %d ", (int)hopt->prio); if (show_details) - print_int(PRINT_ANY, "quantum", "quantum ", + print_int(PRINT_ANY, "quantum", "quantum %d ", (int)hopt->quantum); } From 4df5bb1be0a346c63172a6ca974a89b4df5cd606 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Thu, 5 Jul 2018 08:20:09 -0700 Subject: [PATCH 06/13] man: Fix typos on tc-cbs Fix 2 typos on the man page of the CBS qdisc. 
Signed-off-by: Jesus Sanchez-Palencia Reviewed-by: Simon Horman Signed-off-by: Stephen Hemminger --- man/man8/tc-cbs.8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man8/tc-cbs.8 b/man/man8/tc-cbs.8 index 32e1e0d4..ad1d8821 100644 --- a/man/man8/tc-cbs.8 +++ b/man/man8/tc-cbs.8 @@ -28,7 +28,7 @@ defined rate limiting method to the traffic. This queueing discipline is intended to be used by TSN (Time Sensitive Networking) applications, the CBS parameters are derived directly by what is described by the Annex L of the IEEE 802.1Q-2014 -Sepcification. The algorithm and how it affects the latency are +Specification. The algorithm and how it affects the latency are detailed there. CBS is meant to be installed under another qdisc that maps packet @@ -60,7 +60,7 @@ packet size, which is then used for calculating the idleslope. sendslope Sendslope is the rate of credits that is depleted (it should be a negative number of kilobits per second) when a transmission is -ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section +occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section 8.6.8.2 item g): sendslope = idleslope - port_transmit_rate From b49759c0e7c11cef57274b1023ec01c47da69c6c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 9 Jul 2018 09:53:45 -0700 Subject: [PATCH 07/13] tc: don't double print rate Conversion to print stats in JSON forgot to remove existing fprintf. 
Fixes: 4fcec7f3665b ("tc: jsonify stats2") Signed-off-by: Stephen Hemminger --- tc/tc_util.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tc/tc_util.c b/tc/tc_util.c index e0c96291..d7578528 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -842,8 +842,6 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char *prefix, struct rtat memcpy(&re, RTA_DATA(tbs[TCA_STATS_RATE_EST]), MIN(RTA_PAYLOAD(tbs[TCA_STATS_RATE_EST]), sizeof(re))); - fprintf(fp, "\n%srate %s %upps ", - prefix, sprint_rate(re.bps, b1), re.pps); print_string(PRINT_FP, NULL, "\n%s", prefix); print_uint(PRINT_JSON, "rate", NULL, re.bps); print_string(PRINT_FP, NULL, "rate %s", From 8df708afd62e4c3f7c79108d59f1ffa90123e9be Mon Sep 17 00:00:00 2001 From: Serhey Popovych Date: Sun, 15 Jul 2018 00:36:34 +0300 Subject: [PATCH 08/13] ipaddress: Fix and make consistent label match handling Since commit 9516823051ce ("ipaddress: Improve print_linkinfo()") we return -1 instead of 0 when ip-address(8) label does not match network device name as we did before change. This causes a regression when trying to output ip address matching label: # ip addr add 192.168.192.1/24 dev lo label lo:1 # ip addr show label lo:1 This is special case and return 0 from print_linkinfo() earlier to match only filter.ifindex and filter.up if given, but not rest fields in @filter. Then call print_selected_addrinfo() without calling print_link_stats() in ipaddr_list_flush_or_save(). Later print_selected_addrinfo() calls print_addrinfo() that finally matches IFA_LABEL attribute in netlink buffer with filter.label using ifa_label_match_rta(). On the other hand there are three conditions checked in print_linkinfo() to determine label special case: 1) filter.label != NULL 2) filter.family == AF_UNSPEC || filter.family == AF_PACKET 3) fnmatch(filter.label, name, 0) With 1) it is ok to check if filtering by label is on by given pattern in @filter.label. 
Since label is IPv4 specific and AF_PACKET is for printing ip-link(8) information (see ipaddr_link_list()::ipaddress.c as example) checking for AF_PACKET in 2) doesn't take much sense: better to defer these checks to print_addrinfo() determine valid combinations before calling ifa_label_match_rta() to finally match IFA_LABEL to pattern in filter.label. For 3) we have following call for test case: fnmatch(pattern, string, flags) -> fnmatch(filter.label, name, 0) -> fnmatch("lo:1", "lo", 0) == FNM_NOMATCH (1) or non-zero on error To support special case in print_linkinfo() for filtering by label we only need to check if label pattern is given in filter.label and return 0 to skip print_link_stats() in ipaddr_list_flush_or_save(): actual filtering will be done in print_addrinfo(). Before commit 9516823051ce ("ipaddress: Improve print_linkinfo()"): ------------------------------------------------------------------- $ ip addr sh label lo 1: lo: mtu 65536 qdisc noqueue state UNKNOWN \ group default qlen 1000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ fnmatch("lo", "lo", 0) == 0 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever $ ip addr show label 'lo:*' inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever $ ip addr sh label lo:1 inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever $ ip -4 addr sh label lo:1 1: lo: mtu 65536 qdisc noqueue state UNKNOWN \ group default qlen 1000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ filter.family == AF_INET inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever After this change applied: -------------------------- $ ip/ip addr show label lo inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft 
forever $ ip/ip addr show label 'lo:*' inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever $ ip/ip addr show label lo:1 inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever $ ip/ip -4 addr show label lo:1 inet 192.168.192.1/24 scope global lo:1 valid_lft forever preferred_lft forever Note that we no longer show link information as we did previously: we are filtering by "label" pattern, not showing by "dev". Fixes: commit 9516823051ce ("ipaddress: Improve print_linkinfo()") Reported-by: Vincent Bernat Signed-off-by: Serhey Popovych Signed-off-by: Stephen Hemminger --- ip/ipaddress.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 5009bfe6..ea8211c1 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -837,10 +837,8 @@ int print_linkinfo(const struct sockaddr_nl *who, if (!name) return -1; - if (filter.label && - (!filter.family || filter.family == AF_PACKET) && - fnmatch(filter.label, name, 0)) - return -1; + if (filter.label) + return 0; if (tb[IFLA_GROUP]) { int group = rta_getattr_u32(tb[IFLA_GROUP]); From 04cb3c0d4386c6c50d8cdfb90043e2ac022404ac Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Tue, 17 Jul 2018 14:49:52 +0000 Subject: [PATCH 09/13] ip: add support for seg6local End.BPF action This patch adds support for the End.BPF action of the seg6local lightweight tunnel. Functions from the BPF lightweight tunnel are re-used in this patch. 
Example: $ ip -6 route add fc00::18 encap seg6local action End.BPF endpoint obj my_bpf.o sec my_func dev eth0 $ ip -6 route show fc00::18 fc00::18 encap seg6local action End.BPF endpoint my_bpf.o:[my_func] dev eth0 metric 1024 pref medium v2: - re-use of print_encap_bpf_prog instead of fprintf - introduction of "endpoint" keyword for more consistency with others parameters Signed-off-by: Mathieu Xhonneux Signed-off-by: Stephen Hemminger --- ip/iproute_lwtunnel.c | 154 +++++++++++++++++++++++------------------- lib/bpf.c | 5 ++ 2 files changed, 88 insertions(+), 71 deletions(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 46a212c8..e6044811 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -177,6 +177,7 @@ static const char *seg6_action_names[SEG6_LOCAL_ACTION_MAX + 1] = { [SEG6_LOCAL_ACTION_END_S] = "End.S", [SEG6_LOCAL_ACTION_END_AS] = "End.AS", [SEG6_LOCAL_ACTION_END_AM] = "End.AM", + [SEG6_LOCAL_ACTION_END_BPF] = "End.BPF", }; static const char *format_action_type(int action) @@ -202,6 +203,27 @@ static int read_action_type(const char *name) return SEG6_LOCAL_ACTION_UNSPEC; } +static void print_encap_bpf_prog(FILE *fp, struct rtattr *encap, + const char *str) +{ + struct rtattr *tb[LWT_BPF_PROG_MAX+1]; + const char *progname = NULL; + + parse_rtattr_nested(tb, LWT_BPF_PROG_MAX, encap); + + if (tb[LWT_BPF_PROG_NAME]) + progname = rta_getattr_str(tb[LWT_BPF_PROG_NAME]); + + if (is_json_context()) + print_string(PRINT_JSON, str, NULL, + progname ? 
: ""); + else { + fprintf(fp, "%s ", str); + if (progname) + fprintf(fp, "%s ", progname); + } +} + static void print_encap_seg6local(FILE *fp, struct rtattr *encap) { struct rtattr *tb[SEG6_LOCAL_MAX + 1]; @@ -250,6 +272,9 @@ static void print_encap_seg6local(FILE *fp, struct rtattr *encap) print_string(PRINT_ANY, "oif", "oif %s ", ll_index_to_name(oif)); } + + if (tb[SEG6_LOCAL_BPF]) + print_encap_bpf_prog(fp, tb[SEG6_LOCAL_BPF], "endpoint"); } static void print_encap_mpls(FILE *fp, struct rtattr *encap) @@ -356,27 +381,6 @@ static void print_encap_ip6(FILE *fp, struct rtattr *encap) "tc %u ", rta_getattr_u8(tb[LWTUNNEL_IP6_TC])); } -static void print_encap_bpf_prog(FILE *fp, struct rtattr *encap, - const char *str) -{ - struct rtattr *tb[LWT_BPF_PROG_MAX+1]; - const char *progname = NULL; - - parse_rtattr_nested(tb, LWT_BPF_PROG_MAX, encap); - - if (tb[LWT_BPF_PROG_NAME]) - progname = rta_getattr_str(tb[LWT_BPF_PROG_NAME]); - - if (is_json_context()) - print_string(PRINT_JSON, str, NULL, - progname ? 
: ""); - else { - fprintf(fp, "%s ", str); - if (progname) - fprintf(fp, "%s ", progname); - } -} - static void print_encap_bpf(FILE *fp, struct rtattr *encap) { struct rtattr *tb[LWT_BPF_MAX+1]; @@ -546,11 +550,60 @@ static int parse_encap_seg6(struct rtattr *rta, size_t len, int *argcp, return 0; } +struct lwt_x { + struct rtattr *rta; + size_t len; +}; + +static void bpf_lwt_cb(void *lwt_ptr, int fd, const char *annotation) +{ + struct lwt_x *x = lwt_ptr; + + rta_addattr32(x->rta, x->len, LWT_BPF_PROG_FD, fd); + rta_addattr_l(x->rta, x->len, LWT_BPF_PROG_NAME, annotation, + strlen(annotation) + 1); +} + +static const struct bpf_cfg_ops bpf_cb_ops = { + .ebpf_cb = bpf_lwt_cb, +}; + +static int lwt_parse_bpf(struct rtattr *rta, size_t len, + int *argcp, char ***argvp, + int attr, const enum bpf_prog_type bpf_type) +{ + struct bpf_cfg_in cfg = { + .type = bpf_type, + .argc = *argcp, + .argv = *argvp, + }; + struct lwt_x x = { + .rta = rta, + .len = len, + }; + struct rtattr *nest; + int err; + + nest = rta_nest(rta, len, attr); + err = bpf_parse_and_load_common(&cfg, &bpf_cb_ops, &x); + if (err < 0) { + fprintf(stderr, "Failed to parse eBPF program: %s\n", + strerror(-err)); + return -1; + } + rta_nest_end(rta, nest); + + *argcp = cfg.argc; + *argvp = cfg.argv; + + return 0; +} + static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { int segs_ok = 0, hmac_ok = 0, table_ok = 0, nh4_ok = 0, nh6_ok = 0; - int iif_ok = 0, oif_ok = 0, action_ok = 0, srh_ok = 0; + int iif_ok = 0, oif_ok = 0, action_ok = 0, srh_ok = 0, bpf_ok = 0; __u32 action = 0, table, iif, oif; struct ipv6_sr_hdr *srh; char **argv = *argvp; @@ -627,6 +680,14 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp, } else { continue; } + } else if (strcmp(*argv, "endpoint") == 0) { + NEXT_ARG(); + if (bpf_ok++) + duparg2("endpoint", *argv); + + if (lwt_parse_bpf(rta, len, &argc, &argv, SEG6_LOCAL_BPF, + BPF_PROG_TYPE_LWT_SEG6LOCAL) < 0) + 
exit(-1); } else { break; } @@ -896,55 +957,6 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, return 0; } -struct lwt_x { - struct rtattr *rta; - size_t len; -}; - -static void bpf_lwt_cb(void *lwt_ptr, int fd, const char *annotation) -{ - struct lwt_x *x = lwt_ptr; - - rta_addattr32(x->rta, x->len, LWT_BPF_PROG_FD, fd); - rta_addattr_l(x->rta, x->len, LWT_BPF_PROG_NAME, annotation, - strlen(annotation) + 1); -} - -static const struct bpf_cfg_ops bpf_cb_ops = { - .ebpf_cb = bpf_lwt_cb, -}; - -static int lwt_parse_bpf(struct rtattr *rta, size_t len, - int *argcp, char ***argvp, - int attr, const enum bpf_prog_type bpf_type) -{ - struct bpf_cfg_in cfg = { - .type = bpf_type, - .argc = *argcp, - .argv = *argvp, - }; - struct lwt_x x = { - .rta = rta, - .len = len, - }; - struct rtattr *nest; - int err; - - nest = rta_nest(rta, len, attr); - err = bpf_parse_and_load_common(&cfg, &bpf_cb_ops, &x); - if (err < 0) { - fprintf(stderr, "Failed to parse eBPF program: %s\n", - strerror(-err)); - return -1; - } - rta_nest_end(rta, nest); - - *argcp = cfg.argc; - *argvp = cfg.argv; - - return 0; -} - static void lwt_bpf_usage(void) { fprintf(stderr, "Usage: ip route ... encap bpf [ in BPF ] [ out BPF ] [ xmit BPF ] [...]\n"); diff --git a/lib/bpf.c b/lib/bpf.c index 4e26c0df..65e26989 100644 --- a/lib/bpf.c +++ b/lib/bpf.c @@ -95,6 +95,11 @@ static const struct bpf_prog_meta __bpf_prog_meta[] = { .subdir = "ip", .section = ELF_SECTION_PROG, }, + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = { + .type = "lwt_seg6local", + .subdir = "ip", + .section = ELF_SECTION_PROG, + }, }; static bool bpf_map_offload_neutral(enum bpf_map_type type) From b625e3610893fb75cd7199e2752fd488e042dc6b Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Mon, 16 Jul 2018 10:52:18 -0700 Subject: [PATCH 10/13] tc: Do not use addattr_nest_compat on mqprio and netem Here we are partially reverting commit c14f9d92eee107 "treewide: Use addattr_nest()/addattr_nest_end() to handle nested attributes" . 
As discussed in [1], changing from the 'manually' coded version that used addattr_l() to addattr_nest_compat() wasn't functionally equivalent, because now the messages have extra fields appended to it. This introduced a regression since the implementation of parse_attr() from both mqprio and netem can't handle this new message format. Without this fix, mqprio returns an error. netem won't return an error but its internal configuration ends up wrong. As an example, this can be reproduced by the following commands when this patch is not applied: 1) mqprio $ tc qdisc replace dev enp3s0 parent root handle 100 mqprio \ num_tc 3 map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \ queues 1@0 1@1 2@2 hw 0 RTNETLINK answers: Numerical result out of range 2) netem $ tc qdisc add dev enp3s0 root netem rate 5kbit 20 100 5 \ distribution normal latency 1 1 $ tc -s qdisc (...) qdisc netem 8001: dev enp3s0 root refcnt 9 limit 1000 delay 0us 0us Sent 402 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 (...) With this patch applied, the tc -s qdisc command above for netem instead reads: (...) qdisc netem 8002: dev enp3s0 root refcnt 9 limit 1000 delay 0us 0us \ rate 5Kbit packetoverhead 20 cellsize 100 celloverhead 5 Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 (...) 
[1] https://patchwork.ozlabs.org/patch/867860/#1893405 Fixes: c14f9d92eee107 ("treewide: Use addattr_nest()/addattr_nest_end() to handle nested attributes") Reported-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Stephen Hemminger --- tc/q_mqprio.c | 5 +++-- tc/q_netem.c | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tc/q_mqprio.c b/tc/q_mqprio.c index 207d6441..89b46002 100644 --- a/tc/q_mqprio.c +++ b/tc/q_mqprio.c @@ -173,7 +173,8 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc, argc--; argv++; } - tail = addattr_nest_compat(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + tail = NLMSG_TAIL(n); + addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); if (flags & TC_MQPRIO_F_MODE) addattr_l(n, 1024, TCA_MQPRIO_MODE, @@ -208,7 +209,7 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc, addattr_nest_end(n, start); } - addattr_nest_compat_end(n, tail); + tail->rta_len = (void *)NLMSG_TAIL(n) - (void *)tail; return 0; } diff --git a/tc/q_netem.c b/tc/q_netem.c index 623ec903..9f9a9b3d 100644 --- a/tc/q_netem.c +++ b/tc/q_netem.c @@ -422,6 +422,8 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv, } } + tail = NLMSG_TAIL(n); + if (reorder.probability) { if (opt.latency == 0) { fprintf(stderr, "reordering not possible without specifying some delay\n"); @@ -450,7 +452,8 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv, return -1; } - tail = addattr_nest_compat(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + if (addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)) < 0) + return -1; if (present[TCA_NETEM_CORR] && addattr_l(n, 1024, TCA_NETEM_CORR, &cor, sizeof(cor)) < 0) @@ -509,7 +512,7 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv, return -1; free(dist_data); } - addattr_nest_compat_end(n, tail); + tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; return 0; } From 7f57c8b726398835cb3f83f5bef02e3c2853d593 Mon Sep 17 
00:00:00 2001 From: David Ahern Date: Fri, 20 Jul 2018 09:35:26 -0700 Subject: [PATCH 11/13] devlink: CTRL_ATTR_FAMILY_ID is a u16 CTRL_ATTR_FAMILY_ID is a u16, not a u32. Update devlink accordingly. Fixes: a3c4b484a1edd ("add devlink tool") Signed-off-by: David Ahern Signed-off-by: Stephen Hemminger --- devlink/mnlg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devlink/mnlg.c b/devlink/mnlg.c index c33c90be..37cc25dd 100644 --- a/devlink/mnlg.c +++ b/devlink/mnlg.c @@ -199,7 +199,7 @@ int mnlg_socket_group_add(struct mnlg_socket *nlg, const char *group_name) nlh = __mnlg_msg_prepare(nlg, CTRL_CMD_GETFAMILY, NLM_F_REQUEST | NLM_F_ACK, GENL_ID_CTRL, 1); - mnl_attr_put_u32(nlh, CTRL_ATTR_FAMILY_ID, nlg->id); + mnl_attr_put_u16(nlh, CTRL_ATTR_FAMILY_ID, nlg->id); err = mnlg_socket_send(nlg, nlh); if (err < 0) From 7c16a8da6b56d26e222abdda8083724230f0701a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Jul 2018 13:47:22 -0700 Subject: [PATCH 12/13] uapi: fix tcp.h repair Upstream define for TCP_REPAIR changed. Signed-off-by: Stephen Hemminger --- include/uapi/linux/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 2e766cf3..99e329b7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -127,6 +127,10 @@ enum { #define TCP_CM_INQ TCP_INQ +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + struct tcp_repair_opt { __u32 opt_code; __u32 opt_val; From 7327f785653c8d19fc826cf487e000bd6712ae96 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Jul 2018 13:49:20 -0700 Subject: [PATCH 13/13] rdma: uapi update ib_user_verbs.h Merge in latest sanitized kernel header. Put sanitized version of current ib_user_verbs.h. 
Signed-off-by: Stephen Hemminger --- rdma/include/uapi/rdma/ib_user_verbs.h | 63 ++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/rdma/include/uapi/rdma/ib_user_verbs.h b/rdma/include/uapi/rdma/ib_user_verbs.h index 6aeb0331..4f9991de 100644 --- a/rdma/include/uapi/rdma/ib_user_verbs.h +++ b/rdma/include/uapi/rdma/ib_user_verbs.h @@ -998,6 +998,19 @@ struct ib_uverbs_flow_spec_action_handle { __u32 reserved1; }; +struct ib_uverbs_flow_spec_action_count { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + struct ib_uverbs_flow_tunnel_filter { __be32 tunnel_id; }; @@ -1033,6 +1046,56 @@ struct ib_uverbs_flow_spec_esp { struct ib_uverbs_flow_spec_esp_filter mask; }; +struct ib_uverbs_flow_gre_filter { + /* c_ks_res0_ver field is bits 0-15 in offset 0 of a standard GRE header: + * bit 0 - C - checksum bit. + * bit 1 - reserved. set to 0. + * bit 2 - key bit. + * bit 3 - sequence number bit. + * bits 4:12 - reserved. set to 0. + * bits 13:15 - GRE version. + */ + __be16 c_ks_res0_ver; + __be16 protocol; + __be32 key; +}; + +struct ib_uverbs_flow_spec_gre { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_gre_filter val; + struct ib_uverbs_flow_gre_filter mask; +}; + +struct ib_uverbs_flow_mpls_filter { + /* The field includes the entire MPLS label: + * bits 0:19 - label field. + * bits 20:22 - traffic class field. + * bits 23 - bottom of stack bit. + * bits 24:31 - ttl field. + */ + __be32 label; +}; + +struct ib_uverbs_flow_spec_mpls { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_mpls_filter val; + struct ib_uverbs_flow_mpls_filter mask; +}; + struct ib_uverbs_flow_attr { __u32 type; __u16 size;