From 3d65cefbefc86a53877f1e6461a9461e5b8fd7b3 Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 2 Jan 2019 11:57:00 +0800 Subject: [PATCH 01/19] iproute: Set ip/ip6 lwtunnel flags ip l add dev tun type gretap external ip r a 10.0.0.1 encap ip dst 192.168.152.171 id 1000 dev gretap For gretap example when the command set the id but don't set the TUNNEL_KEY flags. There is no key field in the send packet User can set flags with key, csum, seq ip r a 10.0.0.1 encap ip dst 192.168.152.171 id 1000 key csum dev gretap Signed-off-by: wenxu Signed-off-by: Stephen Hemminger --- ip/iproute_lwtunnel.c | 58 +++++++++++++++++++++++++++++++++++++++++- man/man8/ip-route.8.in | 3 ++- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index aee18ac5..03217b8f 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include static const char *format_encap_type(int type) { @@ -294,6 +294,7 @@ static void print_encap_mpls(FILE *fp, struct rtattr *encap) static void print_encap_ip(FILE *fp, struct rtattr *encap) { struct rtattr *tb[LWTUNNEL_IP_MAX+1]; + __u16 flags; parse_rtattr_nested(tb, LWTUNNEL_IP_MAX, encap); @@ -318,6 +319,16 @@ static void print_encap_ip(FILE *fp, struct rtattr *encap) if (tb[LWTUNNEL_IP_TOS]) print_uint(PRINT_ANY, "tos", "tos %d ", rta_getattr_u8(tb[LWTUNNEL_IP_TOS])); + + if (tb[LWTUNNEL_IP_FLAGS]) { + flags = rta_getattr_u16(tb[LWTUNNEL_IP_FLAGS]); + if (flags & TUNNEL_KEY) + print_bool(PRINT_ANY, "key", "key ", true); + if (flags & TUNNEL_CSUM) + print_bool(PRINT_ANY, "csum", "csum ", true); + if (flags & TUNNEL_SEQ) + print_bool(PRINT_ANY, "seq", "seq ", true); + } } static void print_encap_ila(FILE *fp, struct rtattr *encap) @@ -354,6 +365,7 @@ static void print_encap_ila(FILE *fp, struct rtattr *encap) static void print_encap_ip6(FILE *fp, struct rtattr *encap) { struct rtattr *tb[LWTUNNEL_IP6_MAX+1]; + __u16 flags; 
parse_rtattr_nested(tb, LWTUNNEL_IP6_MAX, encap); @@ -379,6 +391,16 @@ static void print_encap_ip6(FILE *fp, struct rtattr *encap) if (tb[LWTUNNEL_IP6_TC]) print_uint(PRINT_ANY, "tc", "tc %u ", rta_getattr_u8(tb[LWTUNNEL_IP6_TC])); + + if (tb[LWTUNNEL_IP6_FLAGS]) { + flags = rta_getattr_u16(tb[LWTUNNEL_IP6_FLAGS]); + if (flags & TUNNEL_KEY) + print_bool(PRINT_ANY, "key", "key ", true); + if (flags & TUNNEL_CSUM) + print_bool(PRINT_ANY, "csum", "csum ", true); + if (flags & TUNNEL_SEQ) + print_bool(PRINT_ANY, "seq", "seq ", true); + } } static void print_encap_bpf(FILE *fp, struct rtattr *encap) @@ -777,9 +799,11 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0; + int key_ok = 0, csum_ok = 0, seq_ok = 0; char **argv = *argvp; int argc = *argcp; int ret = 0; + __u16 flags = 0; while (argc > 0) { if (strcmp(*argv, "id") == 0) { @@ -827,6 +851,18 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, if (get_u8(&ttl, *argv, 0)) invarg("\"ttl\" value is invalid\n", *argv); ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl); + } else if (strcmp(*argv, "key") == 0) { + if (key_ok++) + duparg2("key", *argv); + flags |= TUNNEL_KEY; + } else if (strcmp(*argv, "csum") == 0) { + if (csum_ok++) + duparg2("csum", *argv); + flags |= TUNNEL_CSUM; + } else if (strcmp(*argv, "seq") == 0) { + if (seq_ok++) + duparg2("seq", *argv); + flags |= TUNNEL_SEQ; } else { break; } @@ -835,6 +871,9 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, argc--; argv++; } + if (flags) + ret = rta_addattr16(rta, len, LWTUNNEL_IP_FLAGS, flags); + /* argv is currently the first unparsed argument, * but the lwt_parse_encap() caller will move to the next, * so step back @@ -927,9 +966,11 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0; + int key_ok = 0, csum_ok = 0, seq_ok = 0; char 
**argv = *argvp; int argc = *argcp; int ret = 0; + __u16 flags = 0; while (argc > 0) { if (strcmp(*argv, "id") == 0) { @@ -979,6 +1020,18 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, *argv); ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit); + } else if (strcmp(*argv, "key") == 0) { + if (key_ok++) + duparg2("key", *argv); + flags |= TUNNEL_KEY; + } else if (strcmp(*argv, "csum") == 0) { + if (csum_ok++) + duparg2("csum", *argv); + flags |= TUNNEL_CSUM; + } else if (strcmp(*argv, "seq") == 0) { + if (seq_ok++) + duparg2("seq", *argv); + flags |= TUNNEL_SEQ; } else { break; } @@ -987,6 +1040,9 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, argc--; argv++; } + if (flags) + ret = rta_addattr16(rta, len, LWTUNNEL_IP6_FLAGS, flags); + /* argv is currently the first unparsed argument, * but the lwt_parse_encap() caller will move to the next, * so step back diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 9603ac6e..b9ae6e30 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -737,7 +737,8 @@ is a set of encapsulation attributes specific to the .B tos .IR TOS " ] [" .B ttl -.IR TTL " ]" +.IR TTL " ] [ " +.BR key " ] [" csum " ] [ " seq " ] " .in -2 .sp From 28747146622a49c3e7b5c5b36dc02c6a64124770 Mon Sep 17 00:00:00 2001 From: Hans Dedecker Date: Wed, 23 Jan 2019 22:02:31 +0100 Subject: [PATCH 02/19] f_flower: fix build with musl libc XATTR_SIZE_MAX requires the usage of linux/limits.h; let's include it Signed-off-by: Hans Dedecker Signed-off-by: Stephen Hemminger --- tc/f_flower.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tc/f_flower.c b/tc/f_flower.c index c5636667..9659e894 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include From 17ed56fdf3bc2c5511bb9fa2f1e4487a3db721c0 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Fri, 25 Jan 2019 10:37:07 +0000 Subject: [PATCH 03/19] libnetlink: linkdump_req: AF_PACKET 
family also expects ext_filter_mask Without this fix, the VF info can't be shown using the command "ip link". 146: ens1f0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 24:8a:07:ad:78:52 brd ff:ff:ff:ff:ff:ff vf 0 MAC 02:25:d0:12:01:01, spoof checking off, link-state auto, trust off, query_rss off vf 1 MAC 02:25:d0:12:01:02, spoof checking off, link-state auto, trust off, query_rss off Fixes: d97b16b2c906 ("libnetlink: linkdump_req: Only AF_UNSPEC family expects an ext_filter_mask") Signed-off-by: Chris Mi Acked-by: David Ahern Signed-off-by: Stephen Hemminger --- lib/libnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 110f47bc..3beb4342 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -476,7 +476,7 @@ int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, req_filter_fn_t filter_fn) { - if (family == AF_UNSPEC) { + if (family == AF_UNSPEC || family == AF_PACKET) { struct { struct nlmsghdr nlh; struct ifinfomsg ifm; From 264be1d887102d47d725b299a1b74393259015dc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 25 Jan 2019 17:09:17 +0000 Subject: [PATCH 04/19] bridge: fdb: Fix FDB dump with strict checking disabled While iproute2 correctly uses ifinfomsg struct as the ancillary header when requesting an FDB dump on old kernels, it sets the message type to RTM_GETLINK. This results in wrong reply being returned. Fix this by using RTM_GETNEIGH instead. 
Before: $ bridge fdb show brport dummy0 Not RTM_NEWNEIGH: 00000158 00000010 00000002 After: $ bridge fdb show brport dummy0 2a:0b:41:1c:92:d3 vlan 1 master br0 permanent 2a:0b:41:1c:92:d3 master br0 permanent 33:33:00:00:00:01 self permanent 01:00:5e:00:00:01 self permanent Fixes: 05880354c2cf ("bridge: fdb: Fix filtering with strict checking disabled") Signed-off-by: Ido Schimmel Reported-by: LiLiang Acked-by: David Ahern Acked-by: Ivan Vecera Signed-off-by: Stephen Hemminger --- bridge/fdb.c | 3 +-- include/libnetlink.h | 3 +++ lib/libnetlink.c | 23 +++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index f75e953a..c4bf4039 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -346,8 +346,7 @@ static int fdb_show(int argc, char **argv) if (rth.flags & RTNL_HANDLE_F_STRICT_CHK) rc = rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter); else - rc = rtnl_linkdump_req_filter_fn(&rth, PF_BRIDGE, - fdb_linkdump_filter); + rc = rtnl_fdb_linkdump_req_filter_fn(&rth, fdb_linkdump_filter); if (rc < 0) { perror("Cannot send dump request"); exit(1); diff --git a/include/libnetlink.h b/include/libnetlink.h index 0854d6ad..503b3ec1 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -81,6 +81,9 @@ int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int fam, req_filter_fn_t fn) __attribute__((warn_unused_result)); +int rtnl_fdb_linkdump_req_filter_fn(struct rtnl_handle *rth, + req_filter_fn_t filter_fn) + __attribute__((warn_unused_result)); int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) __attribute__((warn_unused_result)); int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 3beb4342..1892a02a 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -503,6 +503,29 @@ int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, 
int family, return __rtnl_linkdump_req(rth, family); } +int rtnl_fdb_linkdump_req_filter_fn(struct rtnl_handle *rth, + req_filter_fn_t filter_fn) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + char buf[128]; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETNEIGH, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .nlh.nlmsg_seq = rth->dump = ++rth->seq, + .ifm.ifi_family = PF_BRIDGE, + }; + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + + return send(rth->fd, &req, sizeof(req), 0); +} + int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) { struct { From 3da6d055d93fefe40bf88a9bc37b4ce3433696ee Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 Jan 2019 16:41:07 -0800 Subject: [PATCH 05/19] bpf: add btf func and func_proto kind support The issue is discovered for bpf selftest test_skb_cgroup.sh. Currently we have, $ ./test_skb_cgroup_id.sh Wait for testing link-local IP to become available ... OK Object has unknown BTF type: 13! [PASS] In the above the BTF type 13 refers to BTF kind BTF_KIND_FUNC_PROTO. This patch added support of BTF_KIND_FUNC_PROTO and BTF_KIND_FUNC during type parsing. With this patch, I got $ ./test_skb_cgroup_id.sh Wait for testing link-local IP to become available ... 
OK [PASS] Signed-off-by: Yonghong Song Acked-by: Daniel Borkmann Signed-off-by: Stephen Hemminger --- lib/bpf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/bpf.c b/lib/bpf.c index 5e85cfc0..762f8857 100644 --- a/lib/bpf.c +++ b/lib/bpf.c @@ -2193,12 +2193,16 @@ static int bpf_btf_prep_type_data(struct bpf_elf_ctx *ctx) case BTF_KIND_ENUM: type_cur += var_len * sizeof(struct btf_enum); break; + case BTF_KIND_FUNC_PROTO: + type_cur += var_len * sizeof(struct btf_param); + break; case BTF_KIND_TYPEDEF: case BTF_KIND_PTR: case BTF_KIND_FWD: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_FUNC: break; default: fprintf(stderr, "Object has unknown BTF type: %u!\n", kind); From 2d603d55a8160aa40f0a442574f1fc8dedc9a034 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Feb 2019 10:41:58 -0800 Subject: [PATCH 06/19] tc: fix memory leak in error path If value passed to parse_percent was not valid, it would leak the dynamic allocation from sscanf. 
Fixes: 927e3cfb52b5 ("tc: B.W limits can now be specified in %.") Signed-off-by: Stephen Hemminger --- tc/tc_util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tc/tc_util.c b/tc/tc_util.c index ab717890..1377b536 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -195,7 +195,7 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev) long dev_mbit; int ret; double perc, rate_mbit; - char *str_perc; + char *str_perc = NULL; if (!dev[0]) { fprintf(stderr, "No device specified; specify device to rate limit by percentage\n"); @@ -230,6 +230,7 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev) return 0; malf: + free(str_perc); fprintf(stderr, "Specified rate value could not be read or is malformed\n"); return -1; } From 817204d0b0ee98b0849902e5b20cc3e84460b900 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Feb 2019 10:49:47 -0800 Subject: [PATCH 07/19] tc: avoid problems with hard coded rate string length The parse_percent_rate function assumed the buffer was 20 characters. Better to pass length in case the size ever changes. 
Signed-off-by: Stephen Hemminger --- tc/tc_util.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tc/tc_util.c b/tc/tc_util.c index 1377b536..4e289ae9 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -190,7 +190,8 @@ static const struct rate_suffix { { NULL } }; -static int parse_percent_rate(char *rate, const char *str, const char *dev) +static int parse_percent_rate(char *rate, size_t len, + const char *str, const char *dev) { long dev_mbit; int ret; @@ -221,8 +222,8 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev) rate_mbit = perc * dev_mbit; - ret = snprintf(rate, 20, "%lf", rate_mbit); - if (ret <= 0 || ret >= 20) { + ret = snprintf(rate, len, "%lf", rate_mbit); + if (ret <= 0 || ret >= len) { fprintf(stderr, "Unable to parse calculated rate\n"); return -1; } @@ -239,7 +240,7 @@ int get_percent_rate(unsigned int *rate, const char *str, const char *dev) { char r_str[20]; - if (parse_percent_rate(r_str, str, dev)) + if (parse_percent_rate(r_str, sizeof(r_str), str, dev)) return -1; return get_rate(rate, r_str); @@ -249,7 +250,7 @@ int get_percent_rate64(__u64 *rate, const char *str, const char *dev) { char r_str[20]; - if (parse_percent_rate(r_str, str, dev)) + if (parse_percent_rate(r_str, sizeof(r_str), str, dev)) return -1; return get_rate64(rate, r_str); From 9e46c5c2063f2f8aa775d4fa17de3a82daeea47d Mon Sep 17 00:00:00 2001 From: Marcos Antonio Moraes Date: Thu, 7 Feb 2019 13:29:54 -0200 Subject: [PATCH 08/19] tc: use bits not mbits/sec in rate percent As /sys/class/net//speed indicates a value in Mbits/sec, the conversion is necessary to create the correct limits. This guarantees the same result for the following commands in an 1000Mbit/sec device: tc class add ... htb rate 500Mbit tc class add ... 
htb rate 50% Fixes: 927e3cfb52b5 ("tc: B.W limits can now be specified in %.") Signed-off-by: Marcos Antonio Moraes Signed-off-by: Stephen Hemminger --- tc/tc_util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tc/tc_util.c b/tc/tc_util.c index 4e289ae9..07216fba 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -195,7 +195,7 @@ static int parse_percent_rate(char *rate, size_t len, { long dev_mbit; int ret; - double perc, rate_mbit; + double perc, rate_bit; char *str_perc = NULL; if (!dev[0]) { @@ -220,9 +220,9 @@ static int parse_percent_rate(char *rate, size_t len, return -1; } - rate_mbit = perc * dev_mbit; + rate_bit = perc * dev_mbit * 1000 * 1000; - ret = snprintf(rate, len, "%lf", rate_mbit); + ret = snprintf(rate, len, "%lf", rate_bit); if (ret <= 0 || ret >= len) { fprintf(stderr, "Unable to parse calculated rate\n"); return -1; From bb5ae621d0c7b9caf3a101903783bd5a1c997fa4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Feb 2019 17:58:41 -0800 Subject: [PATCH 09/19] lib/libnetlink: ensure a minimum of 32KB for the buffer used in rtnl_recvmsg() In the past, we tried to increase the buffer size up to 32 KB in order to reduce number of syscalls per dump. Commit 2d34851cd341 ("lib/libnetlink: re malloc buff if size is not enough") brought the size back to 4KB because the kernel can not know the application is ready to receive bigger requests. See kernel commits 9063e21fb026 ("netlink: autosize skb lengthes") and d35c99ff77ec ("netlink: do not enter direct reclaim from netlink_dump()") for more details. 
Fixes: 2d34851cd341 ("lib/libnetlink: re malloc buff if size is not enough") Signed-off-by: Eric Dumazet Cc: Hangbin Liu Cc: Phil Sutter Signed-off-by: Stephen Hemminger --- lib/libnetlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 1892a02a..0d48a3d4 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -718,6 +718,8 @@ static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer) if (len < 0) return len; + if (len < 32768) + len = 32768; buf = malloc(len); if (!buf) { fprintf(stderr, "malloc error: not enough buffer\n"); From 0f3f0ca3a2aef77b0e4009a8de31cb48f58993fc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 13 Feb 2019 15:39:01 +0300 Subject: [PATCH 10/19] ss: add option --tos for requesting ipv4 tos and ipv6 tclass Also show socket class_id/priority used by classful qdisc. Kernel report this together with tclass since commit ("inet_diag: fix reporting cgroup classid and fallback to priority") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Stephen Hemminger --- man/man8/ss.8 | 17 +++++++++++++++++ misc/ss.c | 27 +++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index 553a6cf4..9f21202d 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -244,6 +244,23 @@ the pacing rate and max pacing rate a helper variable for TCP internal auto tuning socket receive buffer .RE .TP +.B \-\-tos +Show ToS and priority information. Below fields may appear: +.RS +.P +.TP +.B tos +IPv4 Type-of-Service byte +.P +.TP +.B tclass +IPv6 Traffic Class byte +.P +.TP +.B class_id +Class id set by net_cls cgroup. If class is zero this shows priority set by SO_PRIORITY. +.RE +.TP .B \-K, \-\-kill Attempts to forcibly close sockets. 
This option displays sockets that are successfully closed and silently skips sockets that the kernel does not support diff --git a/misc/ss.c b/misc/ss.c index 3589ebed..9e821faf 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -110,6 +110,7 @@ static int show_header = 1; static int follow_events; static int sctp_ino; static int show_tipcinfo; +static int show_tos; enum col_id { COL_NETID, @@ -3008,6 +3009,15 @@ static int inet_show_sock(struct nlmsghdr *nlh, } } + if (show_tos) { + if (tb[INET_DIAG_TOS]) + out(" tos:%#x", rta_getattr_u8(tb[INET_DIAG_TOS])); + if (tb[INET_DIAG_TCLASS]) + out(" tclass:%#x", rta_getattr_u8(tb[INET_DIAG_TCLASS])); + if (tb[INET_DIAG_CLASS_ID]) + out(" class_id:%#x", rta_getattr_u32(tb[INET_DIAG_CLASS_ID])); + } + if (show_mem || (show_tcpinfo && s->type != IPPROTO_UDP)) { out("\n\t"); if (s->type == IPPROTO_SCTP) @@ -3058,6 +3068,11 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f) req.r.idiag_ext |= (1<<(INET_DIAG_CONG-1)); } + if (show_tos) { + req.r.idiag_ext |= (1<<(INET_DIAG_TOS-1)); + req.r.idiag_ext |= (1<<(INET_DIAG_TCLASS-1)); + } + iov[0] = (struct iovec){ .iov_base = &req, .iov_len = sizeof(req) @@ -3118,6 +3133,11 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f) req.r.idiag_ext |= (1<<(INET_DIAG_CONG-1)); } + if (show_tos) { + req.r.idiag_ext |= (1<<(INET_DIAG_TOS-1)); + req.r.idiag_ext |= (1<<(INET_DIAG_TCLASS-1)); + } + iov[0] = (struct iovec){ .iov_base = &req, .iov_len = sizeof(req) @@ -4661,6 +4681,7 @@ static void _usage(FILE *dest) " -i, --info show internal TCP information\n" " --tipcinfo show internal tipc socket information\n" " -s, --summary show socket usage summary\n" +" --tos show tos and priority information\n" " -b, --bpf show bpf filter socket information\n" " -E, --events continually display sockets as they are destroyed\n" " -Z, --context display process SELinux security contexts\n" @@ -4765,6 +4786,8 @@ static int scan_state(const char *state) #define OPT_TIPCSOCK 
257 #define OPT_TIPCINFO 258 +#define OPT_TOS 259 + static const struct option long_opts[] = { { "numeric", 0, 0, 'n' }, { "resolve", 0, 0, 'r' }, @@ -4800,6 +4823,7 @@ static const struct option long_opts[] = { { "contexts", 0, 0, 'z' }, { "net", 1, 0, 'N' }, { "tipcinfo", 0, 0, OPT_TIPCINFO}, + { "tos", 0, 0, OPT_TOS }, { "kill", 0, 0, 'K' }, { "no-header", 0, 0, 'H' }, { 0 } @@ -4977,6 +5001,9 @@ int main(int argc, char *argv[]) case OPT_TIPCINFO: show_tipcinfo = 1; break; + case OPT_TOS: + show_tos = 1; + break; case 'K': current_filter.kill = 1; break; From 619765fe14b640c9aa8eff73fa5bc32da3cbfe80 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Wed, 13 Feb 2019 15:40:30 +0100 Subject: [PATCH 11/19] iplink: document XDP subcommand to force the XDP mode. When attaching an eBPF program to a device, ip link can force the XDP mode by using the xdp{generic,drv,offload} keyword instead of just 'xdp'. Document this behaviour also in the help output. Signed-off-by: Matteo Croce Fixes: 14683814 ("bpf: add xdpdrv for requesting XDP driver mode") Fixes: 1b5e8094 ("bpf: allow requesting XDP HW offload") Signed-off-by: Stephen Hemminger --- ip/iplink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iplink.c b/ip/iplink.c index b5519201..3a0cf459 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -98,7 +98,7 @@ void iplink_usage(void) " [ trust { on | off} ] ]\n" " [ node_guid { eui64 } ]\n" " [ port_guid { eui64 } ]\n" - " [ xdp { off |\n" + " [ { xdp | xdpgeneric | xdpdrv | xdpoffload } { off |\n" " object FILE [ section NAME ] [ verbose ] |\n" " pinned FILE } ]\n" " [ master DEVICE ][ vrf NAME ]\n" From c2f9dc14c41f388764f7634d36c3d05e354f053a Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Thu, 14 Feb 2019 23:29:18 +0000 Subject: [PATCH 12/19] ip route: get: allow zero-length subnet mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A /0 subnet mask is theoretically valid, but ip route get doesn't allow 
it: $ ip route get 1.0.0.0/0 need at least a destination address Change the check and remember whether we found an address or not, since according to the documentation it's a mandatory parameter. $ ip/ip route get 1.0.0.0/0 1.0.0.0 via 192.168.1.1 dev eth0 src 192.168.1.91 uid 1000 cache Reported-by: Clément Hertling Signed-off-by: Luca Boccassi Signed-off-by: Stephen Hemminger --- ip/iproute.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index 5f58a3b3..cc02a3e1 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1932,6 +1932,7 @@ static int iproute_get(int argc, char **argv) int fib_match = 0; int from_ok = 0; unsigned int mark = 0; + bool address_found = false; iproute_reset_filter(0); filter.cloned = 2; @@ -2037,11 +2038,12 @@ static int iproute_get(int argc, char **argv) addattr_l(&req.n, sizeof(req), RTA_DST, &addr.data, addr.bytelen); req.r.rtm_dst_len = addr.bitlen; + address_found = true; } argc--; argv++; } - if (req.r.rtm_dst_len == 0) { + if (!address_found) { fprintf(stderr, "need at least a destination address\n"); return -1; } From f5f8e96953453c056872ac22a4a311e4931b3fb1 Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Tue, 19 Feb 2019 21:50:19 +0100 Subject: [PATCH 13/19] ip-rule: fix json key "to_tbl" for unspecific rule action The key should not be called "to_tbl" because it is exactly not a FR_ACT_TO_TBL action. Change it to "action". # ip rule add blackhole # ip -j rule | python -m json.tool ... { "priority": 0, "src": "all", "to_tbl": "blackhole" }, This is an API break of JSON output as it was added in v4.17.0. Still change it as the API is relatively new and unstable. 
Fixes: 0dd4ccc56c0e ("iprule: add json support") Signed-off-by: Thomas Haller Signed-off-by: Stephen Hemminger --- ip/iprule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iprule.c b/ip/iprule.c index 2f58d8c2..4e9437de 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -459,7 +459,7 @@ int print_rule(struct nlmsghdr *n, void *arg) } else if (frh->action == FR_ACT_NOP) { print_null(PRINT_ANY, "nop", "nop", NULL); } else if (frh->action != FR_ACT_TO_TBL) { - print_string(PRINT_ANY, "to_tbl", "%s", + print_string(PRINT_ANY, "action", "%s", rtnl_rtntype_n2a(frh->action, b1, sizeof(b1))); } From d7cf2416fc3a08b411beffb93a9e118f6593892d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 21 Feb 2019 19:37:51 +0100 Subject: [PATCH 14/19] ip-address: Use correct max attribute value in print_vf_stats64() IFLA_VF_MAX is larger than the highest valid index in vf array. Fixes: a1b99717c7cd7 ("Add displaying VF traffic statistics") Signed-off-by: Phil Sutter Signed-off-by: Stephen Hemminger --- ip/ipaddress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 2bc33f3a..76edf706 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -547,7 +547,7 @@ static void print_vf_stats64(FILE *fp, struct rtattr *vfstats) return; } - parse_rtattr_nested(vf, IFLA_VF_MAX, vfstats); + parse_rtattr_nested(vf, IFLA_VF_STATS_MAX, vfstats); if (is_json_context()) { open_json_object("stats"); From 0e7e1819453cc5bc5610c896d3cbc5a30b48b164 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 21 Feb 2019 11:55:56 +0100 Subject: [PATCH 15/19] devlink: relax dpipe table show dependency on resources Dpipe table show command has a dependency on getting resources. If resource get command is not supported by the driver, dpipe table show fails. However, resource is only additional information in dpipe table show output. So relax the dependency and let the dpipe tables be shown even if resources get command fails. 
Fixes: ead180274caf ("devlink: Add support for resource/dpipe relation") Signed-off-by: Jiri Pirko Signed-off-by: Stephen Hemminger --- devlink/devlink.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 3651e90c..cced8d61 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -4351,7 +4351,8 @@ static int dpipe_table_show(struct dpipe_ctx *ctx, struct nlattr *nl) size = mnl_attr_get_u32(nla_table[DEVLINK_ATTR_DPIPE_TABLE_SIZE]); counters_enabled = !!mnl_attr_get_u8(nla_table[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); - resource_valid = !!nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID]; + resource_valid = nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID] && + ctx->resources; if (resource_valid) { table->resource_id = mnl_attr_get_u64(nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID]); table->resource_valid = true; @@ -4467,12 +4468,9 @@ static int cmd_dpipe_table_show(struct dl *dl) dl_opts_put(nlh, dl); err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_resource_dump_cb, &resource_ctx); - if (err) { - pr_err("error get resources %s\n", strerror(resource_ctx.err)); - goto err_resource_dump; - } + if (!err) + dpipe_ctx.resources = resource_ctx.resources; - dpipe_ctx.resources = resource_ctx.resources; flags = NLM_F_REQUEST | NLM_F_ACK; nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_DPIPE_TABLE_GET, flags); dl_opts_put(nlh, dl); @@ -4485,8 +4483,6 @@ static int cmd_dpipe_table_show(struct dl *dl) dpipe_ctx_fini(&dpipe_ctx); return 0; -err_resource_dump: - resource_ctx_fini(&resource_ctx); err_resource_ctx_init: err_headers_get: dpipe_ctx_fini(&dpipe_ctx); From 02723cf230bfedb0918ae8a119d20cf4fd65091b Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Feb 2019 11:33:57 -0500 Subject: [PATCH 16/19] bridge: make mcast_flood description consistent This patch simply changes the description of the mcast_flood flag with "flood" instead of "be flooded with" to avoid confusion, and be consistent with the 
description of the flooding flag, which "Controls whether a given port will *flood* unicast traffic for which there is no FDB entry." At the same time, fix the documentation for the "flood" flag which is incorrectly described as "flooding on" or "flooding off". Signed-off-by: Vivien Didelot Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 4 ++-- man/man8/ip-link.8.in | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 72210f62..13c46386 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -344,7 +344,7 @@ Controls whether a given port will sync MAC addresses learned on device port to bridge FDB. .TP -.BR "flooding on " or " flooding off " +.BR "flood on " or " flood off " Controls whether a given port will flood unicast traffic for which there is no FDB entry. By default this flag is on. .TP @@ -361,7 +361,7 @@ switch. .TP .BR "mcast_flood on " or " mcast_flood off " -Controls whether a given port will be flooded with multicast traffic for which there is no MDB entry. By default this flag is on. +Controls whether a given port will flood multicast traffic for which there is no MDB entry. By default this flag is on. .TP .BR "neigh_suppress on " or " neigh_suppress off " diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 73d37c19..6f31453c 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -2183,7 +2183,7 @@ queries. option above. .BR mcast_flood " { " on " | " off " }" -- controls whether a given port will be flooded with multicast traffic for which there is no MDB entry. +- controls whether a given port will flood multicast traffic for which there is no MDB entry. .BI group_fwd_mask " MASK " - set the group forward mask. This is the bitmask that is applied to decide whether to forward incoming frames destined to link-local addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, ie the bridge does not forward any link-local frames coming on this port). 
From 6f618a6a82a9b4cabec9aa5589e36efba339fd38 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 21 Feb 2019 14:24:07 -0800 Subject: [PATCH 17/19] uapi: update inet_diag_info.h Upstream changes. Signed-off-by: Stephen Hemminger --- include/uapi/linux/inet_diag.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index f98d82d4..f3bcd7ee 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -137,15 +137,21 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, - INET_DIAG_DCTCPINFO, - INET_DIAG_PROTOCOL, /* response attribute only */ + + /* + * Next extenstions cannot be requested in struct inet_diag_req_v2: + * its field idiag_ext has only 8 bits. + */ + + INET_DIAG_DCTCPINFO, /* request as INET_DIAG_VEGASINFO */ + INET_DIAG_PROTOCOL, /* response attribute only */ INET_DIAG_SKV6ONLY, INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, - INET_DIAG_MARK, - INET_DIAG_BBRINFO, - INET_DIAG_CLASS_ID, + INET_DIAG_MARK, /* only with CAP_NET_ADMIN */ + INET_DIAG_BBRINFO, /* request as INET_DIAG_VEGASINFO */ + INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, __INET_DIAG_MAX, }; From 9700927a008a803ac119bdf816bdc1baa69d705c Mon Sep 17 00:00:00 2001 From: Thomas De Schampheleire Date: Wed, 20 Feb 2019 15:41:51 +0100 Subject: [PATCH 18/19] ss: fix compilation under glibc < 2.18 Commit c759116a0b2b6da8df9687b0a40ac69050132c77 introduced support for AF_VSOCK. This define is only provided since glibc version 2.18, so compilation fails when using older toolchains. Provide the necessary definitions if needed. 
Signed-off-by: Thomas De Schampheleire Signed-off-by: Stephen Hemminger --- misc/ss.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/misc/ss.c b/misc/ss.c index 9e821faf..766fdc5f 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -51,6 +51,14 @@ #include #include +/* AF_VSOCK/PF_VSOCK is only provided since glibc 2.18 */ +#ifndef PF_VSOCK +#define PF_VSOCK 40 +#endif +#ifndef AF_VSOCK +#define AF_VSOCK PF_VSOCK +#endif + #define MAGIC_SEQ 123456 #define BUF_CHUNK (1024 * 1024) #define LEN_ALIGN(x) (((x) + 1) & ~1) From aa5bd6a252ce46ee6757458f08a071aabdae9264 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 14 Feb 2019 01:58:32 +0100 Subject: [PATCH 19/19] ss: Render buffer to output every time a number of chunks are allocated Eric reported that, with 10 million sockets, ss -emoi (about 1000 bytes output per socket) can easily lead to OOM (buffer would grow to 10GB of memory). Limit the maximum size of the buffer to five chunks, 1M each. Render and flush buffers whenever we reach that. This might make the resulting blocks slightly unaligned between them, with occasional loss of readability on lines occurring every 5k to 50k sockets approximately. Something like (from ss -tu): [...] CLOSE-WAIT 32 0 192.168.1.50:35232 10.0.0.1:https ESTAB 0 0 192.168.1.50:53820 10.0.0.1:https ESTAB 0 0 192.168.1.50:46924 10.0.0.1:https CLOSE-WAIT 32 0 192.168.1.50:35228 10.0.0.1:https [...] However, I don't actually expect any human user to scroll through that amount of sockets, so readability should be preserved when it matters. The bulk of the diffstat comes from moving field_next() around, as we now call render() from it. Functionally, this is implemented by six lines of code, most of them in field_next(). 
Reported-by: Eric Dumazet Fixes: 691bd854bf4a ("ss: Buffer raw fields first, then render them as a table") Signed-off-by: Stefano Brivio Signed-off-by: Stephen Hemminger --- misc/ss.c | 68 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index 766fdc5f..e9033503 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -60,7 +60,8 @@ #endif #define MAGIC_SEQ 123456 -#define BUF_CHUNK (1024 * 1024) +#define BUF_CHUNK (1024 * 1024) /* Buffer chunk allocation size */ +#define BUF_CHUNKS_MAX 5 /* Maximum number of allocated buffer chunks */ #define LEN_ALIGN(x) (((x) + 1) & ~1) #define DIAG_REQUEST(_req, _r) \ @@ -184,6 +185,7 @@ static struct { struct buf_token *cur; /* Position of current token in chunk */ struct buf_chunk *head; /* First chunk */ struct buf_chunk *tail; /* Current chunk */ + int chunks; /* Number of allocated chunks */ } buffer; static const char *TCP_PROTO = "tcp"; @@ -944,6 +946,8 @@ static struct buf_chunk *buf_chunk_new(void) new->end = buffer.cur->data; + buffer.chunks++; + return new; } @@ -1088,33 +1092,6 @@ static int field_is_last(struct column *f) return f - columns == COL_MAX - 1; } -static void field_next(void) -{ - field_flush(current_field); - - if (field_is_last(current_field)) - current_field = columns; - else - current_field++; -} - -/* Walk through fields and flush them until we reach the desired one */ -static void field_set(enum col_id id) -{ - while (id != current_field - columns) - field_next(); -} - -/* Print header for all non-empty columns */ -static void print_header(void) -{ - while (!field_is_last(current_field)) { - if (!current_field->disabled) - out("%s", current_field->header); - field_next(); - } -} - /* Get the next available token in the buffer starting from the current token */ static struct buf_token *buf_token_next(struct buf_token *cur) { @@ -1140,6 +1117,7 @@ static void buf_free_all(void) free(tmp); } buffer.head = NULL; + 
buffer.chunks = 0; } /* Get current screen width, default to 80 columns if TIOCGWINSZ fails */ @@ -1302,6 +1280,40 @@ static void render(void) current_field = columns; } +/* Move to next field, and render buffer if we reached the maximum number of + * chunks, at the last field in a line. + */ +static void field_next(void) +{ + if (field_is_last(current_field) && buffer.chunks >= BUF_CHUNKS_MAX) { + render(); + return; + } + + field_flush(current_field); + if (field_is_last(current_field)) + current_field = columns; + else + current_field++; +} + +/* Walk through fields and flush them until we reach the desired one */ +static void field_set(enum col_id id) +{ + while (id != current_field - columns) + field_next(); +} + +/* Print header for all non-empty columns */ +static void print_header(void) +{ + while (!field_is_last(current_field)) { + if (!current_field->disabled) + out("%s", current_field->header); + field_next(); + } +} + static void sock_state_print(struct sockstat *s) { const char *sock_name;