From 1a22ad2721fbd970e3d9a97af98427567c65d05d Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 12 Jan 2021 12:33:17 +0200 Subject: [PATCH 01/27] build: Fix link errors on some systems Since moving get_rate() and get_size() from tc to lib, on some systems we fail to link because of missing math lib. Move the functions that require math lib to their own c file and add -lm to dcb that now use those functions. ../lib/libutil.a(utils.o): In function `get_rate': utils.c:(.text+0x10dc): undefined reference to `floor' ../lib/libutil.a(utils.o): In function `get_size': utils.c:(.text+0x1394): undefined reference to `floor' ../lib/libutil.a(json_print.o): In function `sprint_size': json_print.c:(.text+0x14c0): undefined reference to `rint' json_print.c:(.text+0x14f4): undefined reference to `rint' json_print.c:(.text+0x157c): undefined reference to `rint' Fixes: f3be0e6366ac ("lib: Move get_rate(), get_rate64() from tc here") Fixes: 44396bdfcc0a ("lib: Move get_size() from tc here") Fixes: adbe5de96662 ("lib: Move sprint_size() from tc here, add print_size()") Signed-off-by: Roi Dayan Reviewed-by: Petr Machata Tested-by: Petr Machata Signed-off-by: Stephen Hemminger --- dcb/Makefile | 1 + include/json_print.h | 3 ++ lib/Makefile | 4 +- lib/json_print.c | 33 ------------ lib/json_print_math.c | 37 +++++++++++++ lib/utils.c | 114 --------------------------------------- lib/utils_math.c | 123 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 166 insertions(+), 149 deletions(-) create mode 100644 lib/json_print_math.c create mode 100644 lib/utils_math.c diff --git a/dcb/Makefile b/dcb/Makefile index 4add954b..7c09bb4f 100644 --- a/dcb/Makefile +++ b/dcb/Makefile @@ -7,6 +7,7 @@ ifeq ($(HAVE_MNL),y) DCBOBJ = dcb.o dcb_buffer.o dcb_ets.o dcb_maxrate.o dcb_pfc.o TARGETS += dcb +LDLIBS += -lm endif diff --git a/include/json_print.h b/include/json_print.h index 1a1ad5ff..6fcf9fd9 100644 --- a/include/json_print.h +++ b/include/json_print.h @@ -15,6 +15,9 @@ #include "json_writer.h" #include "color.h" +#define _IS_JSON_CONTEXT(type) (is_json_context() && (type & PRINT_JSON || type & PRINT_ANY)) +#define _IS_FP_CONTEXT(type) (!is_json_context() && (type & PRINT_FP || type & PRINT_ANY)) + json_writer_t *get_json_writer(void); /* diff --git a/lib/Makefile b/lib/Makefile index 764c9137..6c98f9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -3,8 +3,8 @@ include ../config.mk CFLAGS += -fPIC -UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \ - inet_proto.o namespace.o json_writer.o json_print.o \ +UTILOBJ = utils.o utils_math.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \ + inet_proto.o namespace.o json_writer.o json_print.o json_print_math.o \ names.o color.o bpf_legacy.o bpf_glue.o exec.o fs.o cg_map.o ifeq ($(HAVE_ELF),y) diff --git a/lib/json_print.c b/lib/json_print.c index b086123a..994a2f8d 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -11,16 +11,12 @@ #include #include -#include #include "utils.h" #include "json_print.h" static json_writer_t *_jw; -#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw) -#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY)) - static void __new_json_obj(int json, bool have_array) { if (json) { @@ -342,32 +338,3 @@ int print_color_rate(bool use_iec, enum output_type type, enum color_attr color, free(buf); return rc; } - -char *sprint_size(__u32 sz, char *buf) -{ - long kilo = 1024; - long mega = kilo * kilo; - size_t len = SPRINT_BSIZE - 1; - double tmp = sz; - - if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024) - snprintf(buf, len, "%gMb", rint(tmp / mega)); - else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16) - snprintf(buf, len, "%gKb", rint(tmp / kilo)); - else - snprintf(buf, len, "%ub", sz); - - return buf; -} - -int print_color_size(enum output_type type, enum color_attr color, - const char *key, const char *fmt, __u32 sz) -{ - SPRINT_BUF(buf); - - if (_IS_JSON_CONTEXT(type)) - return print_color_uint(type, color, key, "%u", sz); - - sprint_size(sz, buf); - return print_color_string(type, color, key, fmt, buf); -} diff --git a/lib/json_print_math.c b/lib/json_print_math.c new file mode 100644 index 00000000..f4d50499 --- /dev/null +++ b/lib/json_print_math.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include + +#include "utils.h" +#include "json_print.h" + +char *sprint_size(__u32 sz, char *buf) +{ + long kilo = 1024; + long mega = kilo * kilo; + size_t len = SPRINT_BSIZE - 1; + double tmp = sz; + + if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024) + snprintf(buf, len, "%gMb", rint(tmp / mega)); + else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16) + snprintf(buf, len, "%gKb", rint(tmp / kilo)); + else + snprintf(buf, len, "%ub", sz); + + return buf; +} + +int print_color_size(enum output_type type, enum color_attr color, + const char *key, const char *fmt, __u32 sz) +{ + SPRINT_BUF(buf); + + if (_IS_JSON_CONTEXT(type)) + return print_color_uint(type, color, key, "%u", sz); + + sprint_size(sz, buf); + return print_color_string(type, color, key, fmt, buf); +} diff --git a/lib/utils.c b/lib/utils.c index de875639..a0ba5181 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -513,120 +513,6 @@ int get_addr64(__u64 *ap, const char *cp) return 1; } -/* See http://physics.nist.gov/cuu/Units/binary.html */ -static const struct rate_suffix { - const char *name; - double scale; -} suffixes[] = { - { "bit", 1. }, - { "Kibit", 1024. }, - { "kbit", 1000. }, - { "mibit", 1024.*1024. }, - { "mbit", 1000000. }, - { "gibit", 1024.*1024.*1024. }, - { "gbit", 1000000000. }, - { "tibit", 1024.*1024.*1024.*1024. }, - { "tbit", 1000000000000. }, - { "Bps", 8. }, - { "KiBps", 8.*1024. }, - { "KBps", 8000. }, - { "MiBps", 8.*1024*1024. }, - { "MBps", 8000000. }, - { "GiBps", 8.*1024.*1024.*1024. }, - { "GBps", 8000000000. }, - { "TiBps", 8.*1024.*1024.*1024.*1024. }, - { "TBps", 8000000000000. }, - { NULL } -}; - -int get_rate(unsigned int *rate, const char *str) -{ - char *p; - double bps = strtod(str, &p); - const struct rate_suffix *s; - - if (p == str) - return -1; - - for (s = suffixes; s->name; ++s) { - if (strcasecmp(s->name, p) == 0) { - bps *= s->scale; - p += strlen(p); - break; - } - } - - if (*p) - return -1; /* unknown suffix */ - - bps /= 8; /* -> bytes per second */ - *rate = bps; - /* detect if an overflow happened */ - if (*rate != floor(bps)) - return -1; - return 0; -} - -int get_rate64(__u64 *rate, const char *str) -{ - char *p; - double bps = strtod(str, &p); - const struct rate_suffix *s; - - if (p == str) - return -1; - - for (s = suffixes; s->name; ++s) { - if (strcasecmp(s->name, p) == 0) { - bps *= s->scale; - p += strlen(p); - break; - } - } - - if (*p) - return -1; /* unknown suffix */ - - bps /= 8; /* -> bytes per second */ - *rate = bps; - return 0; -} - -int get_size(unsigned int *size, const char *str) -{ - double sz; - char *p; - - sz = strtod(str, &p); - if (p == str) - return -1; - - if (*p) { - if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0) - sz *= 1024; - else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0) - sz *= 1024*1024*1024; - else if (strcasecmp(p, "gbit") == 0) - sz *= 1024*1024*1024/8; - else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0) - sz *= 1024*1024; - else if (strcasecmp(p, "mbit") == 0) - sz *= 1024*1024/8; - else if (strcasecmp(p, "kbit") == 0) - sz *= 1024/8; - else if (strcasecmp(p, "b") != 0) - return -1; - } - - *size = sz; - - /* detect if an overflow happened */ - if (*size != floor(sz)) - return -1; - - return 0; -} - static void set_address_type(inet_prefix *addr) { switch (addr->family) { diff --git a/lib/utils_math.c b/lib/utils_math.c new file mode 100644 index 00000000..9ef3dd6e --- /dev/null +++ b/lib/utils_math.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "utils.h" + +/* See http://physics.nist.gov/cuu/Units/binary.html */ +static const struct rate_suffix { + const char *name; + double scale; +} suffixes[] = { + { "bit", 1. }, + { "Kibit", 1024. }, + { "kbit", 1000. }, + { "mibit", 1024.*1024. }, + { "mbit", 1000000. }, + { "gibit", 1024.*1024.*1024. }, + { "gbit", 1000000000. }, + { "tibit", 1024.*1024.*1024.*1024. }, + { "tbit", 1000000000000. }, + { "Bps", 8. }, + { "KiBps", 8.*1024. }, + { "KBps", 8000. }, + { "MiBps", 8.*1024*1024. }, + { "MBps", 8000000. }, + { "GiBps", 8.*1024.*1024.*1024. }, + { "GBps", 8000000000. }, + { "TiBps", 8.*1024.*1024.*1024.*1024. }, + { "TBps", 8000000000000. }, + { NULL } +}; + +int get_rate(unsigned int *rate, const char *str) +{ + char *p; + double bps = strtod(str, &p); + const struct rate_suffix *s; + + if (p == str) + return -1; + + for (s = suffixes; s->name; ++s) { + if (strcasecmp(s->name, p) == 0) { + bps *= s->scale; + p += strlen(p); + break; + } + } + + if (*p) + return -1; /* unknown suffix */ + + bps /= 8; /* -> bytes per second */ + *rate = bps; + /* detect if an overflow happened */ + if (*rate != floor(bps)) + return -1; + return 0; +} + +int get_rate64(__u64 *rate, const char *str) +{ + char *p; + double bps = strtod(str, &p); + const struct rate_suffix *s; + + if (p == str) + return -1; + + for (s = suffixes; s->name; ++s) { + if (strcasecmp(s->name, p) == 0) { + bps *= s->scale; + p += strlen(p); + break; + } + } + + if (*p) + return -1; /* unknown suffix */ + + bps /= 8; /* -> bytes per second */ + *rate = bps; + return 0; +} + +int get_size(unsigned int *size, const char *str) +{ + double sz; + char *p; + + sz = strtod(str, &p); + if (p == str) + return -1; + + if (*p) { + if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0) + sz *= 1024; + else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0) + sz *= 1024*1024*1024; + else if (strcasecmp(p, "gbit") == 0) + sz *= 1024*1024*1024/8; + else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0) + sz *= 1024*1024; + else if (strcasecmp(p, "mbit") == 0) + sz *= 1024*1024/8; + else if (strcasecmp(p, "kbit") == 0) + sz *= 1024/8; + else if (strcasecmp(p, "b") != 0) + return -1; + } + + *size = sz; + + /* detect if an overflow happened */ + if (*size != floor(sz)) + return -1; + + return 0; +} From 8dca565b1729ea7773e4299f10ec1b0e49d1af44 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Sun, 17 Jan 2021 22:54:26 +0000 Subject: [PATCH 02/27] vrf: print BPF log buffer if bpf_program_load fails Necessary to understand what is going on when bpf_program_load fails Signed-off-by: Luca Boccassi Reviewed-by: David Ahern Signed-off-by: Stephen Hemminger --- ip/ipvrf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/ipvrf.c b/ip/ipvrf.c index 42779e5c..91578031 100644 --- a/ip/ipvrf.c +++ b/ip/ipvrf.c @@ -278,8 +278,8 @@ static int vrf_configure_cgroup(const char *path, int ifindex) */ prog_fd = prog_load(ifindex); if (prog_fd < 0) { - fprintf(stderr, "Failed to load BPF prog: '%s'\n", - strerror(errno)); + fprintf(stderr, "Failed to load BPF prog: '%s'\n%s", + strerror(errno), bpf_log_buf); if (errno != EPERM) { fprintf(stderr, From 8498ca92d7d734570f108b18d7d0858560c4ac07 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Sun, 17 Jan 2021 22:54:27 +0000 Subject: [PATCH 03/27] vrf: fix ip vrf exec with libbpf The size of bpf_insn is passed to bpf_load_program instead of the number of elements as it expects, so ip vrf exec fails with: $ sudo ip link add vrf-blue type vrf table 10 $ sudo ip link set dev vrf-blue up $ sudo ip/ip vrf exec vrf-blue ls Failed to load BPF prog: 'Invalid argument' last insn is not an exit or jmp processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 Kernel compiled with CGROUP_BPF enabled? https://bugs.debian.org/980046 Reported-by: Emmanuel DECAEN Signed-off-by: Luca Boccassi Reviewed-by: David Ahern Signed-off-by: Stephen Hemminger --- lib/bpf_glue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/bpf_glue.c b/lib/bpf_glue.c index fa609bfe..d00a0dc1 100644 --- a/lib/bpf_glue.c +++ b/lib/bpf_glue.c @@ -14,7 +14,8 @@ int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns, size_t size_log) { #ifdef HAVE_LIBBPF - return bpf_load_program(type, insns, size_insns, license, 0, log, size_log); + return bpf_load_program(type, insns, size_insns / sizeof(struct bpf_insn), + license, 0, log, size_log); #else return bpf_prog_load_dev(type, insns, size_insns, license, 0, log, size_log); #endif From 86d9660dc1805be4435497ff194f618535e8fc97 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 1 Feb 2021 18:44:07 +0100 Subject: [PATCH 04/27] iplink_bareudp: cleanup help message and man page * Fix PROTO description in help message (mpls isn't a valid argument). * Remove SRCPORTMIN description from help message since it doesn't appear in the syntax string. * Use same keywords in help message and in man page. * Use the "ethertype" option name (.B ethertype) rather than the option value (.I ETHERTYPE) in the man page description of [no]multiproto. Signed-off-by: Guillaume Nault Signed-off-by: Stephen Hemminger --- ip/iplink_bareudp.c | 8 +++++--- man/man8/ip-link.8.in | 15 +++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/ip/iplink_bareudp.c b/ip/iplink_bareudp.c index 860ec699..aa311106 100644 --- a/ip/iplink_bareudp.c +++ b/ip/iplink_bareudp.c @@ -22,9 +22,11 @@ static void print_explain(FILE *f) " [ srcportmin PORT ]\n" " [ [no]multiproto ]\n" "\n" - "Where: PORT := 0-65535\n" - " PROTO := NUMBER | ip | mpls\n" - " SRCPORTMIN := 0-65535\n" + "Where: PORT := UDP_PORT\n" + " PROTO := ETHERTYPE\n" + "\n" + "Note: ETHERTYPE can be given as number or as protocol name (\"ipv4\", \"ipv6\",\n" + " \"mpls_uc\", etc.).\n" ); } diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 3516765a..fd67e611 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -1307,9 +1307,9 @@ For a link of type the following additional arguments are supported: .BI "ip link add " DEVICE -.BI type " bareudp " dstport " PORT " ethertype " ETHERTYPE" +.BI type " bareudp " dstport " PORT " ethertype " PROTO" [ -.BI srcportmin " SRCPORTMIN " +.BI srcportmin " PORT " ] [ .RB [ no ] multiproto ] @@ -1320,11 +1320,14 @@ the following additional arguments are supported: - specifies the destination port for the UDP tunnel. .sp -.BI ethertype " ETHERTYPE" +.BI ethertype " PROTO" - specifies the ethertype of the L3 protocol being tunnelled. +.B ethertype +can be given as plain Ethernet protocol number or using the protocol name +("ipv4", "ipv6", "mpls_uc", etc.). .sp -.BI srcportmin " SRCPORTMIN" +.BI srcportmin " PORT" - selects the lowest value of the UDP tunnel source port range. .sp @@ -1332,11 +1335,11 @@ the following additional arguments are supported: - activates support for protocols similar to the one .RB "specified by " ethertype . When -.I ETHERTYPE +.B ethertype is "mpls_uc" (that is, unicast MPLS), this allows the tunnel to also handle multicast MPLS. When -.I ETHERTYPE +.B ethertype is "ipv4", this allows the tunnel to also handle IPv6. This option is disabled by default. From 4712a4617408da5afabc9433c7316a99363fd053 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 22 Jan 2021 03:22:11 +0200 Subject: [PATCH 05/27] man: tc-taprio.8: document the full offload feature Since this feature's introduction in commit 9c66d1564676 ("taprio: Add support for hardware offloading") from kernel v5.4, it never got documented in the man pages. Due to this reason, we see customer reports of seemingly contradictory information: the community manpages claim there is no support for full offload, nonetheless many silicon vendors have already implemented it. This patch documents the full offload feature (enabled by specifying "flags 2" to the taprio qdisc) and gives one more example that tries to illustrate some of the finer points related to the usage. Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/tc-taprio.8 | 51 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/man/man8/tc-taprio.8 b/man/man8/tc-taprio.8 index e1d19ba1..d13c86f7 100644 --- a/man/man8/tc-taprio.8 +++ b/man/man8/tc-taprio.8 @@ -92,7 +92,11 @@ in the schedule; clockid .br Specifies the clock to be used by qdisc's internal timer for measuring -time and scheduling events. +time and scheduling events. This argument must be omitted when using the +full-offload feature (flags 0x2), since in that case, the clockid is +implicitly /dev/ptpN (where N is given by +.B ethtool -T eth0 | grep 'PTP Hardware Clock' +), and therefore not necessarily synchronized with the system's CLOCK_TAI. .TP sched-entry @@ -115,13 +119,27 @@ before moving to the next entry. .TP flags .br -Specifies different modes for taprio. Currently, only txtime-assist is -supported which can be enabled by setting it to 0x1. In this mode, taprio will -set the transmit timestamp depending on the interval in which the packet needs -to be transmitted. It will then utililize the +This is a bit mask which specifies different modes for taprio. +.RS +.TP +.I 0x1 +Enables the txtime-assist feature. In this mode, taprio will set the transmit +timestamp depending on the interval in which the packet needs to be +transmitted. It will then utililize the .BR etf(8) qdisc to sort and transmit the packets at the right time. The second example can be used as a reference to configure this mode. +.TP +.I 0x2 +Enables the full-offload feature. In this mode, taprio will pass the gate +control list to the NIC which will execute it cyclically in hardware. +When using full-offload, there is no need to specify the +.B clockid +argument. + +The txtime-assist and full-offload features are mutually exclusive, i.e. +setting flags to 0x3 is invalid. +.RE .TP txtime-delay @@ -178,5 +196,28 @@ for more information about configuring the ETF qdisc. offload delta 200000 clockid CLOCK_TAI .EE +The following is a schedule in full offload mode. The +.B base-time +is 200 ns and the +.B cycle-time +is implicitly calculated as the sum of all +.B sched-entry +durations (i.e. 20 us + 20 us + 60 us = 100 us). Although the base-time is in +the past, the hardware will start executing the schedule at a PTP time equal to +the smallest integer multiple of 100 us, plus 200 ns, that is larger than the +NIC's current PTP time. + +.EX +# tc qdisc add dev eth0 parent root taprio \\ + num_tc 8 \\ + map 0 1 2 3 4 5 6 7 \\ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \\ + base-time 200 \\ + sched-entry S 80 20000 \\ + sched-entry S a0 20000 \\ + sched-entry S df 60000 \\ + flags 0x2 +.EE + .SH AUTHORS Vinicius Costa Gomes From 3d6d9e6e67a8d0433c9bc3df6f80817d3473bdd4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 25 Jan 2021 17:02:07 +0100 Subject: [PATCH 06/27] ss: do not emit warn while dumping MPTCP on old kernels Prior to this commit, running 'ss' on a kernel older than v5.9 bumps an error message: RTNETLINK answers: Invalid argument When asked to dump protocol number > 255 - that is: MPTCP - 'ss' adds an INET_DIAG_REQ_PROTOCOL attribute, unsupported by the older kernel. Avoid the warning ignoring filter issues when INET_DIAG_REQ_PROTOCOL is used. Additionally older kernel end-up invoking tcpdiag_send(), which in turn will try to dump DCCP socks. Bail early in such function, as the kernel does not implement an MPTCPDIAG_GET request. Reported-by: "Rantala, Tommi T. (Nokia - FI/Espoo)" Fixes: 9c3be2c0eee0 ("ss: mptcp: add msk diag interface support") Signed-off-by: Paolo Abeni Signed-off-by: Stephen Hemminger --- misc/ss.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/misc/ss.c b/misc/ss.c index 0593627b..ad46f9db 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -3404,7 +3404,7 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f) struct iovec iov[3]; int iovlen = 1; - if (protocol == IPPROTO_UDP) + if (protocol == IPPROTO_UDP || protocol == IPPROTO_MPTCP) return -1; if (protocol == IPPROTO_TCP) @@ -3623,6 +3623,14 @@ static int inet_show_netlink(struct filter *f, FILE *dump_fp, int protocol) if (preferred_family == PF_INET6) family = PF_INET6; + /* extended protocol will use INET_DIAG_REQ_PROTOCOL, + * not supported by older kernels. On such kernel + * rtnl_dump will bail with rtnl_dump_error(). + * Suppress the error to avoid confusing the user + */ + if (protocol > 255) + rth.flags |= RTNL_HANDLE_F_SUPPRESS_NLERR; + again: if ((err = sockdiag_send(family, rth.fd, protocol, f))) goto Exit; From 97647618882a5de74ea31068e114601fcd7178b7 Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Tue, 26 Jan 2021 09:40:53 -0800 Subject: [PATCH 07/27] iplink: print warning for missing VF data The kernel might truncate VF info in IFLA_VFINFO_LIST. Compare the expected number of VFs in IFLA_NUM_VF to how many were found in the list and warn accordingly. Signed-off-by: Edwin Peer Signed-off-by: Stephen Hemminger --- ip/ipaddress.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 571346b1..0bbcee2b 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -922,6 +922,7 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) const char *name; unsigned int m_flag = 0; SPRINT_BUF(b1); + bool truncated_vfs = false; if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK) return 0; @@ -1199,15 +1200,18 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) if ((do_link || show_details) && tb[IFLA_VFINFO_LIST] && tb[IFLA_NUM_VF]) { struct rtattr *i, *vflist = tb[IFLA_VFINFO_LIST]; - int rem = RTA_PAYLOAD(vflist); + int rem = RTA_PAYLOAD(vflist), count = 0; open_json_array(PRINT_JSON, "vfinfo_list"); for (i = RTA_DATA(vflist); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) { open_json_object(NULL); print_vfinfo(fp, ifi, i); close_json_object(); + count++; } close_json_array(PRINT_JSON, NULL); + if (count != rta_getattr_u32(tb[IFLA_NUM_VF])) + truncated_vfs = true; } if (tb[IFLA_PROP_LIST]) { @@ -1228,6 +1232,9 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) print_string(PRINT_FP, NULL, "%s", "\n"); fflush(fp); + /* prettier here if stderr and stdout go to the same place */ + if (truncated_vfs) + fprintf(stderr, "Truncated VF list: %s\n", name); return 1; } From df361a27c26a85c38a5f98e765fb306191a2c1ba Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Thu, 28 Jan 2021 01:10:18 -0700 Subject: [PATCH 08/27] Add documentation of ss filter to man page This adds some documentation of the syntax for the FILTER argument to the ss command to the ss (8) man page. Signed-off-by: Thayne McCombs Signed-off-by: Stephen Hemminger --- man/man8/ss.8 | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index e4b9cdcb..bcb0feee 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -440,6 +440,111 @@ states except for - opposite to .B bucket +.SH EXPRESSION + +.B EXPRESSION +allows filtering based on specific criteria. +.B EXPRESSION +consists of a series of predicates combined by boolean operators. The possible operators in increasing +order of precedence are +.B or +(or | or ||), +.B and +(or & or &&), and +.B not +(or !). If no operator is between consecutive predicates, an implicit +.B and +operator is assumed. Subexpressions can be grouped with "(" and ")". +.P +The following predicates are supported: + +.TP +.B {dst|src} [=] HOST +Test if the destination or source matches HOST. See HOST SYNTAX for details. +.TP +.B {dport|sport} [OP] [FAMILY:]:PORT +Compare the destination or source port to PORT. OP can be any of "<", "<=", "=", "!=", +">=" and ">". Following normal arithmetic rules. FAMILY and PORT are as described in +HOST SYNTAX below. +.TP +.B dev [=|!=] DEVICE +Match based on the device the connection uses. DEVICE can either be a device name or the +index of the interface. +.TP +.B fwmark [=|!=] MASK +Matches based on the fwmark value for the connection. This can either be a specific mark value +or a mark value followed by a "/" and a bitmask of which bits to use in the comparison. For example +"fwmark = 0x01/0x03" would match if the two least significant bits of the fwmark were 0x01. +.TP +.B cgroup [=|!=] PATH +Match if the connection is part of a cgroup at the given path. +.TP +.B autobound +Match if the port or path of the source address was automatically allocated +(rather than explicitly specified). +.P +Most operators have aliases. If no operator is supplied "=" is assumed. +Each of the following groups of operators are all equivalent: +.RS +.IP \(bu 2 += == eq +.IP \(bu +!= ne neq +.IP \(bu +> gt +.IP \(bu +< lt +.IP \(bu +>= ge geq +.IP \(bu +<= le leq +.IP \(bu +! not +.IP \(bu +| || or +.IP \(bu +& && and +.RE +.SH HOST SYNTAX +.P +The general host syntax is [FAMILY:]ADDRESS[:PORT]. +.P +FAMILY must be one of the families supported by the -f option. If not given +it defaults to the family given with the -f option, and if that is also +missing, will assume either inet or inet6. +.P +The form of ADDRESS and PORT depends on the family used. "*" can be used as +a wildcard for either the address or port. The details for each family are as +follows: +.TP +.B unix +ADDRESS is a glob pattern (see +.BR fnmatch (3)) +that will be matched case-insensitively against the unix socket's address. Both path and abstract +names are supported. Unix addresses do not support a port, and "*" cannot be used as a wildcard. +.TP +.B link +ADDRESS is the case-insensitive name of an Ethernet protocol to match. PORT +is either a device name or a device index for the desired link device, as seen +in the output of ip link. +.TP +.B netlink +ADDRESS is a descriptor of the netlink family. Possible values come from +/etc/iproute2/nl_protos. PORT is the port id of the socket, which is usually +the same as the owning process id. The value "kernel" can be used to represent +the kernel (port id of 0). +.TP +.B vsock +ADDRESS is an integer representing the CID address, and PORT is the port. +.TP +.BR inet \ and\ inet6 +ADDRESS is an ip address (either v4 or v6 depending on the family) or a DNS +hostname that resolves to an ip address of the required version. An ipv6 +address must be enclosed in "[" and "]" to disambiguate the port separator. The +address may additionally have a prefix length given in CIDR notation (a slash +followed by the prefix length in bits). PORT is either the numerical +socket port, or the service name for the port to match. + .SH USAGE EXAMPLES .TP .B ss -t -a From 38957a2f6c3ec2f69c8c444c77c3cafc296e23f1 Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Tue, 2 Feb 2021 14:30:40 -0800 Subject: [PATCH 09/27] ss: Add clarification about host conditions with multiple familes to man In creating documentation for expressions I ran into an interesting case where if you use two different familie types in the expression, such as in `ss 'sport inet:ssh or src unix:/run/*'`, then you would only get the results for one address family (in this case unix sockets). The reason is that in parse_hostcond if the family is specified we remove any previously added families from filter->families, and preserve the "states" if any states are set. I tried changing this to not reset the families, but ran into some issues with Invalid Argument errors in inet_show_netlink, I think related to the state. I can dig into that more if supporting this is useful, but I'm not sure if these types of expressions would actually be useful in practice. Or perhaps an error should be given if an expression contains conditions with multiple families (besides inet and inet6)? Anyway, for now, this patch just notes the limitation in the man page. Signed-off-by: Thayne McCombs Signed-off-by: Stephen Hemminger --- man/man8/ss.8 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index bcb0feee..42aac6de 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -511,7 +511,9 @@ The general host syntax is [FAMILY:]ADDRESS[:PORT]. .P FAMILY must be one of the families supported by the -f option. If not given it defaults to the family given with the -f option, and if that is also -missing, will assume either inet or inet6. +missing, will assume either inet or inet6. Note that all host conditions in the +expression should either all be the same family or be only inet and inet6. If there +is some other mixture of families, the results will probably be unexpected. .P The form of ADDRESS and PORT depends on the family used. "*" can be used as a wildcard for either the address or port. The details for each family are as From 5a37254b71249bfb73d44d6278d767a6b127a2f9 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Sun, 24 Jan 2021 17:36:58 +0000 Subject: [PATCH 10/27] iproute: force rtm_dst_len to 32/128 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since NETLINK_GET_STRICT_CHK was enabled, the kernel rejects commands that pass a prefix length, eg: ip route get `1.0.0.0/1 Error: ipv4: Invalid values in header for route get request. ip route get 0.0.0.0/0 Error: ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4 Since there's no point in setting a rtm_dst_len that we know is going to be rejected, just force it to the right value if it's passed on the command line. Print a warning to stderr to notify users. Bug-Debian: https://bugs.debian.org/944730 Reported-By: Clément 'wxcafé' Hertling Signed-off-by: Luca Boccassi Signed-off-by: Stephen Hemminger --- ip/iproute.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index ebb5f160..8d4d2ff8 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -2069,7 +2069,18 @@ static int iproute_get(int argc, char **argv) if (addr.bytelen) addattr_l(&req.n, sizeof(req), RTA_DST, &addr.data, addr.bytelen); - req.r.rtm_dst_len = addr.bitlen; + if (req.r.rtm_family == AF_INET && addr.bitlen != 32) { + fprintf(stderr, + "Warning: /%u as prefix is invalid, only /32 (or none) is supported.\n", + addr.bitlen); + req.r.rtm_dst_len = 32; + } else if (req.r.rtm_family == AF_INET6 && addr.bitlen != 128) { + fprintf(stderr, + "Warning: /%u as prefix is invalid, only /128 (or none) is supported.\n", + addr.bitlen); + req.r.rtm_dst_len = 128; + } else + req.r.rtm_dst_len = addr.bitlen; address_found = true; } argc--; argv++; From 2741208502e6ba5158334ee903708dc825759466 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 3 Feb 2021 08:16:16 -0800 Subject: [PATCH 11/27] uapi: pick up rpl.h fix Upstream change to fix byte order issues. Signed-off-by: Stephen Hemminger --- include/uapi/linux/rpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/rpl.h b/include/uapi/linux/rpl.h index c24b64cd..72d60e09 100644 --- a/include/uapi/linux/rpl.h +++ b/include/uapi/linux/rpl.h @@ -28,10 +28,10 @@ struct ipv6_rpl_sr_hdr { pad:4, reserved1:16; #elif defined(__BIG_ENDIAN_BITFIELD) - __u32 reserved:20, + __u32 cmpri:4, + cmpre:4, pad:4, - cmpri:4, - cmpre:4; + reserved:20; #else #error "Please fix " #endif From 675e2df632d0c8ad1bf936506e2090cdc83b2f7e Mon Sep 17 00:00:00 2001 From: Ido Kalir Date: Sun, 14 Feb 2021 10:33:35 +0200 Subject: [PATCH 12/27] rdma: Fix statistics bind/unbing argument handling The dump isn't supported for the statistics bind/unbind commands because they operate on specific QP counters. This is different from query commands that can operate on many objects at the same time. Let's check the user input and ensure that arguments are valid. Fixes: a6d0773ebecc ("rdma: Add stat manual mode support") Signed-off-by: Ido Kalir Signed-off-by: Leon Romanovsky Signed-off-by: Stephen Hemminger --- rdma/rdma.h | 1 + rdma/stat.c | 21 +++++++++++++++++++++ rdma/utils.c | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/rdma/rdma.h b/rdma/rdma.h index fc8bcf09..ab3b388f 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -84,6 +84,7 @@ struct rd_cmd { * Parser interface */ bool rd_no_arg(struct rd *rd); +bool rd_is_multiarg(struct rd *rd); void rd_arg_inc(struct rd *rd); char *rd_argv(struct rd *rd); diff --git a/rdma/stat.c b/rdma/stat.c index a2b5da1c..75d45288 100644 --- a/rdma/stat.c +++ b/rdma/stat.c @@ -502,6 +502,12 @@ static int stat_get_arg(struct rd *rd, const char *arg) return -EINVAL; rd_arg_inc(rd); + + if (rd_is_multiarg(rd)) { + pr_err("The parameter %s shouldn't include range\n", arg); + return -EINVAL; + } + value = strtol(rd_argv(rd), &endp, 10); rd_arg_inc(rd); @@ -523,6 +529,8 @@ static int stat_one_qp_bind(struct rd *rd) return ret; lqpn = stat_get_arg(rd, "lqpn"); + if (lqpn < 0) + return lqpn; rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_SET, &seq, (NLM_F_REQUEST | NLM_F_ACK)); @@ -537,6 +545,9 @@ static int stat_one_qp_bind(struct rd *rd) if (rd_argc(rd)) { cntn = stat_get_arg(rd, "cntn"); + if (cntn < 0) + return cntn; + mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn); } @@ -607,13 +618,23 @@ static int stat_one_qp_unbind(struct rd *rd) unsigned int portid; uint32_t seq; + if (rd_no_arg(rd)) { + stat_help(rd); + return -EINVAL; + } + ret = rd_build_filter(rd, stat_valid_filters); if (ret) return ret; cntn = stat_get_arg(rd, "cntn"); + if (cntn < 0) + return cntn; + if (rd_argc(rd)) { lqpn = stat_get_arg(rd, "lqpn"); + if (lqpn < 0) + return lqpn; return do_stat_qp_unbind_lqpn(rd, cntn, lqpn); } diff --git a/rdma/utils.c b/rdma/utils.c index 2a201aa4..f84b102d 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -47,6 +47,13 @@ bool rd_no_arg(struct rd *rd) return rd_argc(rd) == 0; } +bool rd_is_multiarg(struct rd *rd) +{ + if (!rd_argc(rd)) + return false; + return strpbrk(rd_argv(rd), ",-") != NULL; +} + /* * Possible input:output * dev/port | first port | is_dump_all From 1261459c64fbb5c3ceba3c0ec64e34a825af2fed Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:44:57 +0200 Subject: [PATCH 13/27] man8/bridge.8: document the "permanent" flag for "bridge fdb add" The bridge program parses "local" and "permanent" in just the same way, so it makes sense to tell that to users: fdb_modify: } else if (matches(*argv, "local") == 0 || matches(*argv, "permanent") == 0) { req.ndm.ndm_state |= NUD_PERMANENT; Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index b3414ae3..12c09a56 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -517,6 +517,10 @@ the interface to which this address is associated. - is a local permanent fdb entry .sp +.B permanent +- this is a synonym for "local" +.sp + .B static - is a static (no arp) fdb entry .sp From ae3cb3d34d0bc6adab8eb67d905c6973a5285a1d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:44:58 +0200 Subject: [PATCH 14/27] man8/bridge.8: document that "local" is default for "bridge fdb add" The bridge does this: fdb_modify: /* Assume permanent */ if (!(req.ndm.ndm_state&(NUD_PERMANENT|NUD_REACHABLE))) req.ndm.ndm_state |= NUD_PERMANENT; So let's make the user aware of the fact that if they don't want local entries, they need to specify some other flag like "static". Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 12c09a56..223e65d6 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -514,7 +514,8 @@ the Ethernet MAC address. the interface to which this address is associated. .B local -- is a local permanent fdb entry +- is a local permanent fdb entry. This flag is default unless "static" or + "dynamic" are explicitly specified. .sp .B permanent From 10130bfafe8a7e679ecf3911fdec0111ffa51f01 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:44:59 +0200 Subject: [PATCH 15/27] man8/bridge.8: explain what a local FDB entry is Explaining the "local" flag by saying that it is "a local permanent fdb entry" is not very helpful, be more specific. Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 223e65d6..b629c52b 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -514,8 +514,10 @@ the Ethernet MAC address. the interface to which this address is associated. .B local -- is a local permanent fdb entry. This flag is default unless "static" or - "dynamic" are explicitly specified. +- is a local permanent fdb entry, which means that the bridge will not forward +frames with this destination MAC address and VLAN ID, but terminate them +locally. This flag is default unless "static" or "dynamic" are explicitly +specified. .sp .B permanent From b64ceb687db6ee4a28d23a9a046646ced1623d39 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:45:00 +0200 Subject: [PATCH 16/27] man8/bridge.8: fix which one of self/master is default for "bridge fdb" The bridge program does: fdb_modify: /* Assume self */ if (!(req.ndm.ndm_flags&(NTF_SELF|NTF_MASTER))) req.ndm.ndm_flags |= NTF_SELF; which is clearly against the documented behavior. The only thing we can do, sadly, is update the documentation. Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index b629c52b..1dc0aec8 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -533,11 +533,12 @@ specified. .sp .B self -- the address is associated with the port drivers fdb. Usually hardware. +- the address is associated with the port drivers fdb. Usually hardware + (default). .sp .B master -- the address is associated with master devices fdb. Usually software (default). +- the address is associated with master devices fdb. Usually software. .sp .B router From 14f528a556f806973f59efe5e1b95322b5300410 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:45:01 +0200 Subject: [PATCH 17/27] man8/bridge.8: explain self vs master for "bridge fdb add" The "usually hardware" and "usually software" distinctions make no sense, try to clarify what these do based on the actual kernel behavior. Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 1dc0aec8..d0bcd708 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -533,12 +533,21 @@ specified. .sp .B self -- the address is associated with the port drivers fdb. Usually hardware - (default). +- the operation is fulfilled directly by the driver for the specified network +device. If the network device belongs to a master like a bridge, then the +bridge is bypassed and not notified of this operation (and if the device does +notify the bridge, it is driver-specific behavior and not mandated by this +flag, check the driver for more details). The "bridge fdb add" command can also +be used on the bridge device itself, and in this case, the added fdb entries +will be locally terminated (not forwarded). In the latter case, the "self" flag +is mandatory. The flag is set by default if "master" is not specified. .sp .B master -- the address is associated with master devices fdb. Usually software. +- if the specified network device is a port that belongs to a master device +such as a bridge, the operation is fulfilled by the master device's driver, +which may in turn notify the port driver too of the address. If the specified +device is a master itself, such as a bridge, this flag is invalid. .sp .B router From e1d79d49edb6e6ebf7e54b55948dfad150090592 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 11 Feb 2021 12:45:02 +0200 Subject: [PATCH 18/27] man8/bridge.8: be explicit that "flood" is an egress setting Talking to varios people, it became apparent that there is a certain ambiguity in the description of these flags. They refer to egress flooding, which should perhaps be stated more clearly. Signed-off-by: Vladimir Oltean Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index d0bcd708..9d8663bd 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -397,7 +397,8 @@ bridge FDB. .TP .BR "flood on " or " flood off " -Controls whether a given port will flood unicast traffic for which there is no FDB entry. By default this flag is on. +Controls whether unicast traffic for which there is no FDB entry will be +flooded towards this given port. By default this flag is on. .TP .B hwmode @@ -413,8 +414,8 @@ switch. .TP .BR "mcast_flood on " or " mcast_flood off " -Controls whether a given port will flood multicast traffic for which -there is no MDB entry. By default this flag is on. +Controls whether multicast traffic for which there is no MDB entry will be +flooded towards this given port. By default this flag is on. .TP .BR "mcast_to_unicast on " or " mcast_to_unicast off " From 546f738220e7d492d3ea47d4b85f11487a0dae87 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 21:22:47 +0100 Subject: [PATCH 19/27] tc: m_gate: use SPRINT_BUF when needed sprint_time64() uses SPRINT_BSIZE-1 as a constant buffer lenght in its implementation, however m_gate uses shorter buffers when calling it. Fix this using SPRINT_BUF macro to get the buffer, thus getting a SPRINT_BSIZE-long buffer. Fixes: 07d5ee70b5b3 ("iproute2-next:tc:action: add a gate control action") Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- tc/m_gate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tc/m_gate.c b/tc/m_gate.c index 892775a3..c091ae19 100644 --- a/tc/m_gate.c +++ b/tc/m_gate.c @@ -427,7 +427,7 @@ static int print_gate_list(struct rtattr *list) __u32 index = 0, interval = 0; __u8 gate_state = 0; __s32 ipv = -1, maxoctets = -1; - char buf[22]; + SPRINT_BUF(buf); parse_rtattr_nested(tb, TCA_GATE_ENTRY_MAX, item); @@ -490,7 +490,7 @@ static int print_gate(struct action_util *au, FILE *f, struct rtattr *arg) __s64 base_time = 0; __s64 cycle_time = 0; __s64 cycle_time_ext = 0; - char buf[22]; + SPRINT_BUF(buf); int prio = -1; if (arg == NULL) From e833dbe140c529ab0b50eaa01ef37cb7b1c00e22 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 21:23:01 +0100 Subject: [PATCH 20/27] ip: lwtunnel: seg6: bail out if table ids are invalid When table and vrftable are used in SRv6, ip should bail out if table ids are not valid, and return a proper error message to the user. Achieve this simply checking rtnl_rttable_a2n return value, as we already do in the rest of iproute. Fixes: 0486388a877a ("add support for table name in SRv6 End.DT* behaviors") Fixes: 69629b4e43c4 ("seg6: add support for vrftable attribute in SRv6 End.DT4/DT6 behaviors") Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- ip/iproute_lwtunnel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 1ab95cd2..566fc7ea 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -891,13 +891,15 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp, NEXT_ARG(); if (table_ok++) duparg2("table", *argv); - rtnl_rttable_a2n(&table, *argv); + if (rtnl_rttable_a2n(&table, *argv)) + invarg("invalid table id\n", *argv); ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table); } else if (strcmp(*argv, "vrftable") == 0) { NEXT_ARG(); if (vrftable_ok++) duparg2("vrftable", *argv); - rtnl_rttable_a2n(&vrftable, *argv); + if (rtnl_rttable_a2n(&vrftable, *argv)) + invarg("invalid vrf table id\n", *argv); ret = rta_addattr32(rta, len, SEG6_LOCAL_VRFTABLE, vrftable); } else if (strcmp(*argv, "nh4") == 0) { From 1e25de9a9239cd6d7cc1d0ec3f7552d4ee7087e8 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 12:40:36 +0100 Subject: [PATCH 21/27] lib/namespace: fix ip -all netns return code When ip -all netns {del,exec} are called and no netns is present, ip exit with status 0. However this does not happen if no netns has been created since boot time: in that case, indeed, the NETNS_RUN_DIR is not present and netns_foreach() exit with code 1. $ ls /var/run/netns ls: cannot access '/var/run/netns': No such file or directory $ ip -all netns exec ip link show $ echo $? 1 $ ip -all netns del $ echo $? 1 $ ip netns add test $ ip netns del test $ ip -all netns del $ echo $? 0 $ ls -a /var/run/netns . .. This leaves us in the unpleasant situation where the same command, when no netns is present, does the same stuff (in this case, nothing), but exit with two different statuses. Fix this treating ENOENT in a different way from other errors, similarly to what we already do in ipnetns.c netns_identify_pid() Fixes: e998e118ddc3 ("lib: Exec func on each netns") Reported-by: Jianlin Shi Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- lib/namespace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/namespace.c b/lib/namespace.c index 06ae0a48..45a7dedd 100644 --- a/lib/namespace.c +++ b/lib/namespace.c @@ -122,8 +122,14 @@ int netns_foreach(int (*func)(char *nsname, void *arg), void *arg) struct dirent *entry; dir = opendir(NETNS_RUN_DIR); - if (!dir) + if (!dir) { + if (errno == ENOENT) + return 0; + + fprintf(stderr, "Failed to open directory %s: %s\n", + NETNS_RUN_DIR, strerror(errno)); return -1; + } while ((entry = readdir(dir)) != NULL) { if (strcmp(entry->d_name, ".") == 0) From d4fcdbbec9df2fe287e443b5a69f622768c63126 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 18:43:10 +0100 Subject: [PATCH 22/27] lib/bpf: Fix and simplify bpf_mnt_check_target() As stated in commit ac3415f5c1b1 ("lib/fs: Fix and simplify make_path()"), calling stat() before mkdir() is racey, because the entry might change in between. As the call to stat() seems to only check for target existence, we can simply call mkdir() unconditionally and catch all errors but EEXIST. Fixes: 95ae9a4870e7 ("bpf: fix mnt path when from env") Signed-off-by: Andrea Claudi --- lib/bpf_legacy.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/lib/bpf_legacy.c b/lib/bpf_legacy.c index bc869c3f..8a03b9c2 100644 --- a/lib/bpf_legacy.c +++ b/lib/bpf_legacy.c @@ -510,20 +510,14 @@ static int bpf_mnt_fs(const char *target) static int bpf_mnt_check_target(const char *target) { - struct stat sb = {}; int ret; - ret = stat(target, &sb); - if (ret) { - ret = mkdir(target, S_IRWXU); - if (ret) { - fprintf(stderr, "mkdir %s failed: %s\n", target, - strerror(errno)); - return ret; - } - } + ret = mkdir(target, S_IRWXU); + if (ret && errno != EEXIST) + fprintf(stderr, "mkdir %s failed: %s\n", target, + strerror(errno)); - return 0; + return ret; } static int bpf_valid_mntpt(const char *mnt, unsigned long magic) From 1de363b1800c371037ff2b2a6c1004627e58f68e Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 19:14:31 +0100 Subject: [PATCH 23/27] lib/fs: avoid double call to mkdir on make_path() make_path() function calls mkdir two times in a row. The first one it stores mkdir return code, and then it calls it again to check for errno. This seems unnecessary, as we can use the return code from the first call and check for errno if not 0. Fixes: ac3415f5c1b1d ("lib/fs: Fix and simplify make_path()") Acked-by: Phil Sutter Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- lib/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fs.c b/lib/fs.c index 4b90a704..2ae506ec 100644 --- a/lib/fs.c +++ b/lib/fs.c @@ -253,7 +253,7 @@ int make_path(const char *path, mode_t mode) *delim = '\0'; rc = mkdir(dir, mode); - if (mkdir(dir, mode) != 0 && errno != EEXIST) { + if (rc && errno != EEXIST) { fprintf(stderr, "mkdir failed for %s: %s\n", dir, strerror(errno)); goto out; From b2d44b9a95270203dd3c2adb38f6c4ba549d4196 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 22 Feb 2021 19:14:32 +0100 Subject: [PATCH 24/27] lib/fs: Fix single return points for get_cgroup2_* Functions get_cgroup2_id() and get_cgroup2_path() may call close() with a negative argument. Avoid that making the calls conditional on the file descriptors. get_cgroup2_path() may also return NULL leaking a file descriptor. Ensure this does not happen using a single return point. Fixes: d5e6ee0dac64 ("ss: introduce cgroup2 cache and helper functions") Fixes: 8f1cd119b377 ("lib: fix checking of returned file handle size for cgroup") Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- lib/fs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/fs.c b/lib/fs.c index 2ae506ec..ee0b130b 100644 --- a/lib/fs.c +++ b/lib/fs.c @@ -157,7 +157,8 @@ __u64 get_cgroup2_id(const char *path) memcpy(cg_id.bytes, fhp->f_handle, sizeof(__u64)); out: - close(mnt_fd); + if (mnt_fd >= 0) + close(mnt_fd); free(mnt); return cg_id.id; @@ -179,16 +180,16 @@ char *get_cgroup2_path(__u64 id, bool full) char *path = NULL; char fd_path[64]; int link_len; - char *mnt; + char *mnt = NULL; if (!id) { fprintf(stderr, "Invalid cgroup2 ID\n"); - return NULL; + goto out; } mnt = find_cgroup2_mount(false); if (!mnt) - return NULL; + goto out; mnt_fd = open(mnt, O_RDONLY); if (mnt_fd < 0) { @@ -225,8 +226,10 @@ char *get_cgroup2_path(__u64 id, bool full) "Failed to allocate memory for cgroup2 path\n"); out: - close(fd); - close(mnt_fd); + if (fd >= 0) + close(fd); + if (mnt_fd >= 0) + close(mnt_fd); free(mnt); return path; From bbddfcec6c32781e5b4915ef4ce6b9b13eed82ef Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Feb 2021 09:34:11 -0800 Subject: [PATCH 25/27] v5.11.0 --- include/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/version.h b/include/version.h index cf7afbbf..1a1f4f83 100644 --- a/include/version.h +++ b/include/version.h @@ -1 +1 @@ -static const char version[] = "5.10.0"; +static const char version[] = "5.11.0"; From 5e0e73c347a13692d6f8e266561abaea4c7b06ce Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Feb 2021 23:10:51 -0800 Subject: [PATCH 26/27] Update kernel headers from 5.12-pre rc Signed-off-by: Stephen Hemminger --- include/uapi/linux/bpf.h | 103 +++++++++++++++++++++++++++++++++-- include/uapi/linux/mptcp.h | 2 + include/uapi/linux/pkt_cls.h | 1 + 3 files changed, 100 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1daeda13..9c135426 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1656,22 +1656,30 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return @@ -2231,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -3836,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4001,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4501,6 +4576,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { @@ -4981,9 +5057,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -5029,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 58ce1c6a..c3e40165 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -103,6 +103,8 @@ struct mptcp_info { __u64 mptcpi_write_seq; __u64 mptcpi_snd_una; __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index afe6836e..7ea59cfe 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -593,6 +593,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, }; enum { From 9d00602f82b56d890d16d3d607676bacf16f2fcc Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Feb 2021 23:12:14 -0800 Subject: [PATCH 27/27] vdpa: add .gitignore Ignore the resulting binary vdpa. Signed-off-by: Stephen Hemminger --- vdpa/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 vdpa/.gitignore diff --git a/vdpa/.gitignore b/vdpa/.gitignore new file mode 100644 index 00000000..7ef28784 --- /dev/null +++ b/vdpa/.gitignore @@ -0,0 +1 @@ +vdpa