Merge branch 'main' into next

Signed-off-by: David Ahern <dsahern@kernel.org>
2021-03-01 00:07:57 +00:00 · 2021-03-01 00:07:57 +00:00 · 455c9f5361
parent c7897ec2a6 9d00602f82
commit 455c9f5361
30 changed files with 549 additions and 205 deletions
--- a/dcb/Makefile
+++ b/dcb/Makefile
@ -13,6 +13,7 @@ DCBOBJ = dcb.o \
         dcb_maxrate.o \
         dcb_pfc.o
 TARGETS += dcb
+LDLIBS += -lm

 endif

--- a/include/json_print.h
+++ b/include/json_print.h
@ -15,6 +15,9 @@
 #include "json_writer.h"
 #include "color.h"

+#define _IS_JSON_CONTEXT(type) (is_json_context() && ((type) & PRINT_JSON || (type) & PRINT_ANY))
+#define _IS_FP_CONTEXT(type)   (!is_json_context() && ((type) & PRINT_FP || (type) & PRINT_ANY))
+
 json_writer_t *get_json_writer(void);

 /*
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@ -1656,22 +1656,30 @@ union bpf_attr {
 * 		networking traffic statistics as it provides a global socket
 * 		identifier that can be assumed unique.
 * 	Return
- * 		A 8-byte long non-decreasing number on success, or 0 if the
- * 		socket field is missing inside *skb*.
+ * 		A 8-byte long unique number on success, or 0 if the socket
+ * 		field is missing inside *skb*.
 *
 * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
 * 	Description
 * 		Equivalent to bpf_get_socket_cookie() helper that accepts
 * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
 * 	Return
- * 		A 8-byte long non-decreasing number.
+ * 		A 8-byte long unique number.
 *
 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
 * 	Description
 * 		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
 * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
 * 	Return
- * 		A 8-byte long non-decreasing number.
+ * 		A 8-byte long unique number.
+ *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * 	Description
+ * 		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * 		*skb*, but gets socket from a BTF **struct sock**. This helper
+ * 		also works for sleepable programs.
+ * 	Return
+ * 		A 8-byte long unique number or 0 if *sk* is NULL.
 *
 * u32 bpf_get_socket_uid(struct sk_buff *skb)
 * 	Return
@ -2231,6 +2239,9 @@ union bpf_attr {
 *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
 *		  packet is not forwarded or needs assist from full stack
 *
+ *		If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU
+ *		was exceeded and output params->mtu_result contains the MTU.
+ *
 * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
 *	Description
 *		Add an entry to, or update a sockhash *map* referencing sockets.
@ -3836,6 +3847,69 @@ union bpf_attr {
 *	Return
 *		A pointer to a struct socket on success or NULL if the file is
 *		not a socket.
+ *
+ * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
+ *	Description
+
+ *		Check ctx packet size against exceeding MTU of net device (based
+ *		on *ifindex*).  This helper will likely be used in combination
+ *		with helpers that adjust/change the packet size.
+ *
+ *		The argument *len_diff* can be used for querying with a planned
+ *		size change. This allows checking the MTU prior to changing the
+ *		packet ctx. Providing a *len_diff* adjustment that is larger than
+ *		the actual packet size (resulting in a negative packet size) will
+ *		in principle not exceed the MTU, so it is not considered a
+ *		failure.  Other BPF helpers are needed for performing the
+ *		planned size change, so the responsibility for catching a
+ *		negative packet size belongs in those helpers.
+ *
+ *		Specifying *ifindex* zero means the MTU check is performed
+ *		against the current net device.  This is practical if this isn't
+ *		used prior to redirect.
+ *
+ *		The Linux kernel route table can configure MTUs on a more
+ *		specific per route level, which is not provided by this helper.
+ *		For route level MTU checks use the **bpf_fib_lookup**\ ()
+ *		helper.
+ *
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** for tc cls_act programs.
+ *
+ *		The *flags* argument can be a combination of one or more of the
+ *		following values:
+ *
+ *		**BPF_MTU_CHK_SEGS**
+ *			This flag only works for *ctx* **struct sk_buff**.
+ *			If packet context contains extra packet segment buffers
+ *			(often known as GSO skb), then MTU check is harder to
+ *			check at this point, because in transmit path it is
+ *			possible for the skb packet to get re-segmented
+ *			(depending on net device features).  This could still be
+ *			a MTU violation, so this flag enables performing MTU
+ *			check against segments, with a different violation
+ *			return code to tell it apart. Check cannot use len_diff.
+ *
+ *		On return *mtu_len* pointer contains the MTU value of the net
+ *		device.  Remember the net device configured MTU is the L3 size,
+ *		which is returned here, while XDP and TX length operate at L2.
+ *		The helper takes this into account for you, but remember it
+ *		when using the MTU value in your BPF code.  On input *mtu_len*
+ *		must be a valid pointer and be initialized (to zero), else the
+ *		verifier will reject the BPF program.
+ *
+ *	Return
+ *		* 0 on success, and populate MTU value in *mtu_len* pointer.
+ *
+ *		* < 0 if any input argument is invalid (*mtu_len* not updated)
+ *
+ *		MTU violations return positive values, but also populate MTU
+ *		value in *mtu_len* pointer, as this can be needed for
+ *		implementing PMTU handling:
+ *
+ *		* **BPF_MTU_CHK_RET_FRAG_NEEDED**
+ *		* **BPF_MTU_CHK_RET_SEGS_TOOBIG**
+ *
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@ -4001,6 +4075,7 @@ union bpf_attr {
 	FN(ktime_get_coarse_ns),	\
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
+	FN(check_mtu),			\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@ -4501,6 +4576,7 @@ struct bpf_prog_info {
 	__aligned_u64 prog_tags;
 	__u64 run_time_ns;
 	__u64 run_cnt;
+	__u64 recursion_misses;
 } __attribute__((aligned(8)));

 struct bpf_map_info {
@ -4981,9 +5057,13 @@ struct bpf_fib_lookup {
 	__be16	sport;
 	__be16	dport;

-	/* total length of packet from network header - used for MTU check */
-	__u16	tot_len;
+	union {	/* used for MTU check */
+		/* input to lookup */
+		__u16	tot_len; /* L3 length from network hdr (iph->tot_len) */

+		/* output: MTU value */
+		__u16	mtu_result;
+	};
 	/* input: L3 device index for lookup
 	 * output: device index from FIB lookup
 	 */
@ -5029,6 +5109,17 @@ struct bpf_redir_neigh {
 	};
 };

+/* bpf_check_mtu flags */
+enum  bpf_check_mtu_flags {
+	BPF_MTU_CHK_SEGS  = (1U << 0),
+};
+
+enum bpf_check_mtu_ret {
+	BPF_MTU_CHK_RET_SUCCESS,      /* check and lookup successful */
+	BPF_MTU_CHK_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+	BPF_MTU_CHK_RET_SEGS_TOOBIG,  /* GSO re-segmentation needed to fwd */
+};
+
 enum bpf_task_fd_type {
 	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
 	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@ -103,6 +103,8 @@ struct mptcp_info {
 	__u64	mptcpi_write_seq;
 	__u64	mptcpi_snd_una;
 	__u64	mptcpi_rcv_nxt;
+	__u8	mptcpi_local_addr_used;
+	__u8	mptcpi_local_addr_max;
 };

 /*
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@ -593,6 +593,7 @@ enum {
 	TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */
 	TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */
 	TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */
+	__TCA_FLOWER_KEY_CT_FLAGS_MAX,
 };

 enum {
--- a/include/version.h
+++ b/include/version.h
@ -1 +1 @@
-static const char version[] = "5.10.0";
+static const char version[] = "5.11.0";
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@ -922,6 +922,7 @@ int print_linkinfo(struct nlmsghdr *n, void *arg)
 	const char *name;
 	unsigned int m_flag = 0;
 	SPRINT_BUF(b1);
+	bool truncated_vfs = false;

 	if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
 		return 0;
@ -1199,15 +1200,18 @@ int print_linkinfo(struct nlmsghdr *n, void *arg)

 	if ((do_link || show_details) && tb[IFLA_VFINFO_LIST] && tb[IFLA_NUM_VF]) {
 		struct rtattr *i, *vflist = tb[IFLA_VFINFO_LIST];
-		int rem = RTA_PAYLOAD(vflist);
+		int rem = RTA_PAYLOAD(vflist), count = 0;

 		open_json_array(PRINT_JSON, "vfinfo_list");
 		for (i = RTA_DATA(vflist); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
 			open_json_object(NULL);
 			print_vfinfo(fp, ifi, i);
 			close_json_object();
+			count++;
 		}
 		close_json_array(PRINT_JSON, NULL);
+		if (count != rta_getattr_u32(tb[IFLA_NUM_VF]))
+			truncated_vfs = true;
 	}

 	if (tb[IFLA_PROP_LIST]) {
@ -1228,6 +1232,9 @@ int print_linkinfo(struct nlmsghdr *n, void *arg)

 	print_string(PRINT_FP, NULL, "%s", "\n");
 	fflush(fp);
+	/* prettier here if stderr and stdout go to the same place */
+	if (truncated_vfs)
+		fprintf(stderr, "Truncated VF list: %s\n", name);
 	return 1;
 }

--- a/ip/iplink_bareudp.c
+++ b/ip/iplink_bareudp.c
@ -22,9 +22,11 @@ static void print_explain(FILE *f)
 		"		[ srcportmin PORT ]\n"
 		"		[ [no]multiproto ]\n"
 		"\n"
-		"Where:	PORT       := 0-65535\n"
-		"	PROTO      := NUMBER | ip | mpls\n"
-		"	SRCPORTMIN := 0-65535\n"
+		"Where:	PORT  := UDP_PORT\n"
+		"	PROTO := ETHERTYPE\n"
+		"\n"
+		"Note: ETHERTYPE can be given as number or as protocol name (\"ipv4\", \"ipv6\",\n"
+		"      \"mpls_uc\", etc.).\n"
 	);
 }

--- a/ip/iproute.c
+++ b/ip/iproute.c
@ -2071,7 +2071,18 @@ static int iproute_get(int argc, char **argv)
 			if (addr.bytelen)
 				addattr_l(&req.n, sizeof(req),
 					  RTA_DST, &addr.data, addr.bytelen);
-			req.r.rtm_dst_len = addr.bitlen;
+			if (req.r.rtm_family == AF_INET && addr.bitlen != 32) {
+				fprintf(stderr,
+					"Warning: /%u as prefix is invalid, only /32 (or none) is supported.\n",
+					addr.bitlen);
+				req.r.rtm_dst_len = 32;
+			} else if (req.r.rtm_family == AF_INET6 && addr.bitlen != 128) {
+				fprintf(stderr,
+					"Warning: /%u as prefix is invalid, only /128 (or none) is supported.\n",
+					addr.bitlen);
+				req.r.rtm_dst_len = 128;
+			} else
+				req.r.rtm_dst_len = addr.bitlen;
 			address_found = true;
 		}
 		argc--; argv++;
--- a/ip/iproute_lwtunnel.c
+++ b/ip/iproute_lwtunnel.c
@ -891,13 +891,15 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			NEXT_ARG();
 			if (table_ok++)
 				duparg2("table", *argv);
-			rtnl_rttable_a2n(&table, *argv);
+			if (rtnl_rttable_a2n(&table, *argv))
+				invarg("invalid table id\n", *argv);
 			ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
 		} else if (strcmp(*argv, "vrftable") == 0) {
 			NEXT_ARG();
 			if (vrftable_ok++)
 				duparg2("vrftable", *argv);
-			rtnl_rttable_a2n(&vrftable, *argv);
+			if (rtnl_rttable_a2n(&vrftable, *argv))
+				invarg("invalid vrf table id\n", *argv);
 			ret = rta_addattr32(rta, len, SEG6_LOCAL_VRFTABLE,
 					    vrftable);
 		} else if (strcmp(*argv, "nh4") == 0) {
--- a/ip/ipvrf.c
+++ b/ip/ipvrf.c
@ -278,8 +278,8 @@ static int vrf_configure_cgroup(const char *path, int ifindex)
 	 */
 	prog_fd = prog_load(ifindex);
 	if (prog_fd < 0) {
-		fprintf(stderr, "Failed to load BPF prog: '%s'\n",
-			strerror(errno));
+		fprintf(stderr, "Failed to load BPF prog: '%s'\n%s",
+			strerror(errno), bpf_log_buf);

 		if (errno != EPERM) {
 			fprintf(stderr,
--- a/lib/Makefile
+++ b/lib/Makefile
@ -3,8 +3,8 @@ include ../config.mk

 CFLAGS += -fPIC

-UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
-	inet_proto.o namespace.o json_writer.o json_print.o \
+UTILOBJ = utils.o utils_math.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
+	inet_proto.o namespace.o json_writer.o json_print.o json_print_math.o \
 	names.o color.o bpf_legacy.o bpf_glue.o exec.o fs.o cg_map.o

 ifeq ($(HAVE_ELF),y)
--- a/lib/bpf_glue.c
+++ b/lib/bpf_glue.c
@ -14,7 +14,8 @@ int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns,
 		     size_t size_log)
 {
 #ifdef HAVE_LIBBPF
-	return bpf_load_program(type, insns, size_insns, license, 0, log, size_log);
+	return bpf_load_program(type, insns, size_insns / sizeof(struct bpf_insn),
+				license, 0, log, size_log);
 #else
 	return bpf_prog_load_dev(type, insns, size_insns, license, 0, log, size_log);
 #endif
--- a/lib/bpf_legacy.c
+++ b/lib/bpf_legacy.c
@ -510,20 +510,14 @@ static int bpf_mnt_fs(const char *target)

 static int bpf_mnt_check_target(const char *target)
 {
-	struct stat sb = {};
 	int ret;

-	ret = stat(target, &sb);
-	if (ret) {
-		ret = mkdir(target, S_IRWXU);
-		if (ret) {
-			fprintf(stderr, "mkdir %s failed: %s\n", target,
-				strerror(errno));
-			return ret;
-		}
-	}
+	ret = mkdir(target, S_IRWXU);
+	if (!ret || errno == EEXIST)	/* created, or already present */
+		return 0;
+	fprintf(stderr, "mkdir %s failed: %s\n", target, strerror(errno));

-	return 0;
+	return ret;
 }

 static int bpf_valid_mntpt(const char *mnt, unsigned long magic)
--- a/lib/fs.c
+++ b/lib/fs.c
@ -157,7 +157,8 @@ __u64 get_cgroup2_id(const char *path)
 	memcpy(cg_id.bytes, fhp->f_handle, sizeof(__u64));

 out:
-	close(mnt_fd);
+	if (mnt_fd >= 0)
+		close(mnt_fd);
 	free(mnt);

 	return cg_id.id;
@ -179,16 +180,16 @@ char *get_cgroup2_path(__u64 id, bool full)
 	char *path = NULL;
 	char fd_path[64];
 	int link_len;
-	char *mnt;
+	char *mnt = NULL;

 	if (!id) {
 		fprintf(stderr, "Invalid cgroup2 ID\n");
-		return NULL;
+		goto out;
 	}

 	mnt = find_cgroup2_mount(false);
 	if (!mnt)
-		return NULL;
+		goto out;

 	mnt_fd = open(mnt, O_RDONLY);
 	if (mnt_fd < 0) {
@ -225,8 +226,10 @@ char *get_cgroup2_path(__u64 id, bool full)
 			"Failed to allocate memory for cgroup2 path\n");

 out:
-	close(fd);
-	close(mnt_fd);
+	if (fd >= 0)
+		close(fd);
+	if (mnt_fd >= 0)
+		close(mnt_fd);
 	free(mnt);

 	return path;
@ -253,7 +256,7 @@ int make_path(const char *path, mode_t mode)
 			*delim = '\0';

 		rc = mkdir(dir, mode);
-		if (mkdir(dir, mode) != 0 && errno != EEXIST) {
+		if (rc && errno != EEXIST) {
 			fprintf(stderr, "mkdir failed for %s: %s\n",
 				dir, strerror(errno));
 			goto out;
--- a/lib/json_print.c
+++ b/lib/json_print.c
@ -11,16 +11,12 @@

 #include <stdarg.h>
 #include <stdio.h>
-#include <math.h>

 #include "utils.h"
 #include "json_print.h"

 static json_writer_t *_jw;

-#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
-#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
-
 static void __new_json_obj(int json, bool have_array)
 {
 	if (json) {
@ -342,32 +338,3 @@ int print_color_rate(bool use_iec, enum output_type type, enum color_attr color,
 	free(buf);
 	return rc;
 }
-
-char *sprint_size(__u32 sz, char *buf)
-{
-	long kilo = 1024;
-	long mega = kilo * kilo;
-	size_t len = SPRINT_BSIZE - 1;
-	double tmp = sz;
-
-	if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024)
-		snprintf(buf, len, "%gMb", rint(tmp / mega));
-	else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16)
-		snprintf(buf, len, "%gKb", rint(tmp / kilo));
-	else
-		snprintf(buf, len, "%ub", sz);
-
-	return buf;
-}
-
-int print_color_size(enum output_type type, enum color_attr color,
-		     const char *key, const char *fmt, __u32 sz)
-{
-	SPRINT_BUF(buf);
-
-	if (_IS_JSON_CONTEXT(type))
-		return print_color_uint(type, color, key, "%u", sz);
-
-	sprint_size(sz, buf);
-	return print_color_string(type, color, key, fmt, buf);
-}
--- a/lib/json_print_math.c
+++ b/lib/json_print_math.c
@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "utils.h"
+#include "json_print.h"
+
+char *sprint_size(__u32 sz, char *buf)
+{
+	long kilo = 1024;
+	long mega = kilo * kilo;
+	size_t len = SPRINT_BSIZE - 1;
+	double tmp = sz;
+
+	if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024)
+		snprintf(buf, len, "%gMb", rint(tmp / mega));
+	else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16)
+		snprintf(buf, len, "%gKb", rint(tmp / kilo));
+	else
+		snprintf(buf, len, "%ub", sz);
+
+	return buf;
+}
+
+int print_color_size(enum output_type type, enum color_attr color,
+		     const char *key, const char *fmt, __u32 sz)
+{
+	SPRINT_BUF(buf);
+
+	if (_IS_JSON_CONTEXT(type))
+		return print_color_uint(type, color, key, "%u", sz);
+
+	sprint_size(sz, buf);
+	return print_color_string(type, color, key, fmt, buf);
+}
--- a/lib/namespace.c
+++ b/lib/namespace.c
@ -122,8 +122,14 @@ int netns_foreach(int (*func)(char *nsname, void *arg), void *arg)
 	struct dirent *entry;

 	dir = opendir(NETNS_RUN_DIR);
-	if (!dir)
+	if (!dir) {
+		if (errno == ENOENT)
+			return 0;
+
+		fprintf(stderr, "Failed to open directory %s: %s\n",
+			NETNS_RUN_DIR, strerror(errno));
 		return -1;
+	}

 	while ((entry = readdir(dir)) != NULL) {
 		if (strcmp(entry->d_name, ".") == 0)
--- a/lib/utils.c
+++ b/lib/utils.c
@ -513,120 +513,6 @@ int get_addr64(__u64 *ap, const char *cp)
 	return 1;
 }

-/* See http://physics.nist.gov/cuu/Units/binary.html */
-static const struct rate_suffix {
-	const char *name;
-	double scale;
-} suffixes[] = {
-	{ "bit",	1. },
-	{ "Kibit",	1024. },
-	{ "kbit",	1000. },
-	{ "mibit",	1024.*1024. },
-	{ "mbit",	1000000. },
-	{ "gibit",	1024.*1024.*1024. },
-	{ "gbit",	1000000000. },
-	{ "tibit",	1024.*1024.*1024.*1024. },
-	{ "tbit",	1000000000000. },
-	{ "Bps",	8. },
-	{ "KiBps",	8.*1024. },
-	{ "KBps",	8000. },
-	{ "MiBps",	8.*1024*1024. },
-	{ "MBps",	8000000. },
-	{ "GiBps",	8.*1024.*1024.*1024. },
-	{ "GBps",	8000000000. },
-	{ "TiBps",	8.*1024.*1024.*1024.*1024. },
-	{ "TBps",	8000000000000. },
-	{ NULL }
-};
-
-int get_rate(unsigned int *rate, const char *str)
-{
-	char *p;
-	double bps = strtod(str, &p);
-	const struct rate_suffix *s;
-
-	if (p == str)
-		return -1;
-
-	for (s = suffixes; s->name; ++s) {
-		if (strcasecmp(s->name, p) == 0) {
-			bps *= s->scale;
-			p += strlen(p);
-			break;
-		}
-	}
-
-	if (*p)
-		return -1; /* unknown suffix */
-
-	bps /= 8; /* -> bytes per second */
-	*rate = bps;
-	/* detect if an overflow happened */
-	if (*rate != floor(bps))
-		return -1;
-	return 0;
-}
-
-int get_rate64(__u64 *rate, const char *str)
-{
-	char *p;
-	double bps = strtod(str, &p);
-	const struct rate_suffix *s;
-
-	if (p == str)
-		return -1;
-
-	for (s = suffixes; s->name; ++s) {
-		if (strcasecmp(s->name, p) == 0) {
-			bps *= s->scale;
-			p += strlen(p);
-			break;
-		}
-	}
-
-	if (*p)
-		return -1; /* unknown suffix */
-
-	bps /= 8; /* -> bytes per second */
-	*rate = bps;
-	return 0;
-}
-
-int get_size(unsigned int *size, const char *str)
-{
-	double sz;
-	char *p;
-
-	sz = strtod(str, &p);
-	if (p == str)
-		return -1;
-
-	if (*p) {
-		if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0)
-			sz *= 1024;
-		else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0)
-			sz *= 1024*1024*1024;
-		else if (strcasecmp(p, "gbit") == 0)
-			sz *= 1024*1024*1024/8;
-		else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0)
-			sz *= 1024*1024;
-		else if (strcasecmp(p, "mbit") == 0)
-			sz *= 1024*1024/8;
-		else if (strcasecmp(p, "kbit") == 0)
-			sz *= 1024/8;
-		else if (strcasecmp(p, "b") != 0)
-			return -1;
-	}
-
-	*size = sz;
-
-	/* detect if an overflow happened */
-	if (*size != floor(sz))
-		return -1;
-
-	return 0;
-}
-
 static void set_address_type(inet_prefix *addr)
 {
 	switch (addr->family) {
--- a/lib/utils_math.c
+++ b/lib/utils_math.c
@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <asm/types.h>
+
+#include "utils.h"
+
+/* See http://physics.nist.gov/cuu/Units/binary.html */
+static const struct rate_suffix {
+	const char *name;
+	double scale;
+} suffixes[] = {
+	{ "bit",	1. },
+	{ "Kibit",	1024. },
+	{ "kbit",	1000. },
+	{ "mibit",	1024.*1024. },
+	{ "mbit",	1000000. },
+	{ "gibit",	1024.*1024.*1024. },
+	{ "gbit",	1000000000. },
+	{ "tibit",	1024.*1024.*1024.*1024. },
+	{ "tbit",	1000000000000. },
+	{ "Bps",	8. },
+	{ "KiBps",	8.*1024. },
+	{ "KBps",	8000. },
+	{ "MiBps",	8.*1024*1024. },
+	{ "MBps",	8000000. },
+	{ "GiBps",	8.*1024.*1024.*1024. },
+	{ "GBps",	8000000000. },
+	{ "TiBps",	8.*1024.*1024.*1024.*1024. },
+	{ "TBps",	8000000000000. },
+	{ NULL }
+};
+
+int get_rate(unsigned int *rate, const char *str)
+{
+	char *p;
+	double bps = strtod(str, &p);
+	const struct rate_suffix *s;
+
+	if (p == str)
+		return -1;
+
+	for (s = suffixes; s->name; ++s) {
+		if (strcasecmp(s->name, p) == 0) {
+			bps *= s->scale;
+			p += strlen(p);
+			break;
+		}
+	}
+
+	if (*p)
+		return -1; /* unknown suffix */
+
+	bps /= 8; /* -> bytes per second */
+	*rate = bps;
+	/* detect if an overflow happened */
+	if (*rate != floor(bps))
+		return -1;
+	return 0;
+}
+
+int get_rate64(__u64 *rate, const char *str)
+{
+	char *p;
+	double bps = strtod(str, &p);
+	const struct rate_suffix *s;
+
+	if (p == str)
+		return -1;
+
+	for (s = suffixes; s->name; ++s) {
+		if (strcasecmp(s->name, p) == 0) {
+			bps *= s->scale;
+			p += strlen(p);
+			break;
+		}
+	}
+
+	if (*p)
+		return -1; /* unknown suffix */
+
+	bps /= 8; /* -> bytes per second */
+	*rate = bps;
+	return 0;
+}
+
+int get_size(unsigned int *size, const char *str)
+{
+	double sz;
+	char *p;
+
+	sz = strtod(str, &p);
+	if (p == str)
+		return -1;
+
+	if (*p) {
+		if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0)
+			sz *= 1024;
+		else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0)
+			sz *= 1024*1024*1024;
+		else if (strcasecmp(p, "gbit") == 0)
+			sz *= 1024*1024*1024/8;
+		else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0)
+			sz *= 1024*1024;
+		else if (strcasecmp(p, "mbit") == 0)
+			sz *= 1024*1024/8;
+		else if (strcasecmp(p, "kbit") == 0)
+			sz *= 1024/8;
+		else if (strcasecmp(p, "b") != 0)
+			return -1;
+	}
+
+	*size = sz;
+
+	/* detect if an overflow happened */
+	if (*size != floor(sz))
+		return -1;
+
+	return 0;
+}
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@ -397,7 +397,8 @@ bridge FDB.

 .TP
 .BR "flood on " or " flood off "
-Controls whether a given port will flood unicast traffic for which there is no FDB entry. By default this flag is on.
+Controls whether unicast traffic for which there is no FDB entry will be
+flooded towards this given port. By default this flag is on.

 .TP
 .B hwmode
@ -413,8 +414,8 @@ switch.

 .TP
 .BR "mcast_flood on " or " mcast_flood off "
-Controls whether a given port will flood multicast traffic for which
-there is no MDB entry. By default this flag is on.
+Controls whether multicast traffic for which there is no MDB entry will be
+flooded towards this given port. By default this flag is on.

 .TP
 .BR "mcast_to_unicast on " or " mcast_to_unicast off "
@ -514,7 +515,14 @@ the Ethernet MAC address.
 the interface to which this address is associated.

 .B local
- is a local permanent fdb entry
+- is a local permanent fdb entry, which means that the bridge will not forward
+frames with this destination MAC address and VLAN ID, but terminate them
+locally. This flag is default unless "static" or "dynamic" are explicitly
+specified.
+.sp
+
+.B permanent
+- this is a synonym for "local"
 .sp

 .B static
@ -526,11 +534,21 @@ the interface to which this address is associated.
 .sp

 .B self
- the address is associated with the port drivers fdb. Usually hardware.
+- the operation is fulfilled directly by the driver for the specified network
+device. If the network device belongs to a master like a bridge, then the
+bridge is bypassed and not notified of this operation (and if the device does
+notify the bridge, it is driver-specific behavior and not mandated by this
+flag, check the driver for more details). The "bridge fdb add" command can also
+be used on the bridge device itself, and in this case, the added fdb entries
+will be locally terminated (not forwarded). In the latter case, the "self" flag
+is mandatory. The flag is set by default if "master" is not specified.
 .sp

 .B master
- the address is associated with master devices fdb. Usually software (default).
+- if the specified network device is a port that belongs to a master device
+such as a bridge, the operation is fulfilled by the master device's driver,
+which may in turn notify the port driver too of the address. If the specified
+device is a master itself, such as a bridge, this flag is invalid.
 .sp

 .B router
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@ -1307,9 +1307,9 @@ For a link of type
 the following additional arguments are supported:

 .BI "ip link add " DEVICE
-.BI type " bareudp " dstport " PORT " ethertype " ETHERTYPE"
+.BI type " bareudp " dstport " PORT " ethertype " PROTO"
 [
-.BI srcportmin " SRCPORTMIN "
+.BI srcportmin " PORT "
 ] [
 .RB [ no ] multiproto
 ]
@ -1320,11 +1320,14 @@ the following additional arguments are supported:
 - specifies the destination port for the UDP tunnel.

 .sp
-.BI ethertype " ETHERTYPE"
+.BI ethertype " PROTO"
 - specifies the ethertype of the L3 protocol being tunnelled.
+.B ethertype
+can be given as plain Ethernet protocol number or using the protocol name
+("ipv4", "ipv6", "mpls_uc", etc.).

 .sp
-.BI srcportmin " SRCPORTMIN"
+.BI srcportmin " PORT"
 - selects the lowest value of the UDP tunnel source port range.

 .sp
@ -1332,11 +1335,11 @@ the following additional arguments are supported:
 - activates support for protocols similar to the one
 .RB "specified by " ethertype .
 When
-.I ETHERTYPE
+.B ethertype
 is "mpls_uc" (that is, unicast MPLS), this allows the tunnel to also handle
 multicast MPLS.
 When
-.I ETHERTYPE
+.B ethertype
 is "ipv4", this allows the tunnel to also handle IPv6. This option is disabled
 by default.

--- a/man/man8/ss.8
+++ b/man/man8/ss.8
@ -440,6 +440,113 @@ states except for
 - opposite to
 .B bucket

+.SH EXPRESSION
+
+.B EXPRESSION
+allows filtering based on specific criteria.
+.B EXPRESSION
+consists of a series of predicates combined by boolean operators. The possible operators in increasing
+order of precedence are
+.B or
+(or | or ||),
+.B and
+(or & or &&), and
+.B not
+(or !). If no operator is between consecutive predicates, an implicit
+.B and
+operator is assumed. Subexpressions can be grouped with "(" and ")".
+.P
+The following predicates are supported:
+
+.TP
+.B {dst|src} [=] HOST
+Test if the destination or source matches HOST. See HOST SYNTAX for details.
+.TP
+.B {dport|sport} [OP] [FAMILY:]:PORT
+Compare the destination or source port to PORT. OP can be any of "<", "<=", "=", "!=",
+">=" and ">", following normal arithmetic rules. FAMILY and PORT are as described in
+HOST SYNTAX below.
+.TP
+.B dev [=|!=] DEVICE
+Match based on the device the connection uses. DEVICE can either be a device name or the
+index of the interface.
+.TP
+.B fwmark [=|!=] MASK
+Matches based on the fwmark value for the connection. This can either be a specific mark value
+or a mark value followed by a "/" and a bitmask of which bits to use in the comparison. For example
+"fwmark = 0x01/0x03" would match if the two least significant bits of the fwmark were 0x01.
+.TP
+.B cgroup [=|!=] PATH
+Match if the connection is part of a cgroup at the given path.
+.TP
+.B autobound
+Match if the port or path of the source address was automatically allocated
+(rather than explicitly specified).
+.P
+Most operators have aliases. If no operator is supplied "=" is assumed.
+Each of the following groups of operators are all equivalent:
+.RS
+.IP \(bu 2
+= == eq
+.IP \(bu
+!= ne neq
+.IP \(bu
+> gt
+.IP \(bu
+< lt
+.IP \(bu
+>= ge geq
+.IP \(bu
+<= le leq
+.IP \(bu
+! not
+.IP \(bu
+| || or
+.IP \(bu
+& && and
+.RE
+.SH HOST SYNTAX
+.P
+The general host syntax is [FAMILY:]ADDRESS[:PORT].
+.P
+FAMILY must be one of the families supported by the -f option. If not given
+it defaults to the family given with the -f option, and if that is also
+missing, will assume either inet or inet6. Note that all host conditions in the
+expression should either all be the same family or be only inet and inet6. If there
+is some other mixture of families, the results will probably be unexpected.
+.P
+The form of ADDRESS and PORT depends on the family used. "*" can be used as
+a wildcard for either the address or port. The details for each family are as
+follows:
+.TP
+.B unix
+ADDRESS is a glob pattern (see
+.BR fnmatch (3))
+that will be matched case-insensitively against the unix socket's address. Both path and abstract
+names are supported. Unix addresses do not support a port, and "*" cannot be used as a wildcard.
+.TP
+.B link
+ADDRESS is the case-insensitive name of an Ethernet protocol to match. PORT
+is either a device name or a device index for the desired link device, as seen
+in the output of ip link.
+.TP
+.B netlink
+ADDRESS is a descriptor of the netlink family. Possible values come from
+/etc/iproute2/nl_protos. PORT is the port id of the socket, which is usually
+the same as the owning process id. The value "kernel" can be used to represent
+the kernel (port id of 0).
+.TP
+.B vsock
+ADDRESS is an integer representing the CID address, and PORT is the port.
+.TP
+.BR inet \ and\  inet6
+ADDRESS is an ip address (either v4 or v6 depending on the family) or a DNS
+hostname that resolves to an ip address of the required version. An ipv6
+address must be enclosed in "[" and "]" to disambiguate the port separator. The
+address may additionally have a prefix length given in CIDR notation (a slash
+followed by the prefix length in bits). PORT is either the numerical
+socket port, or the service name for the port to match.
+
 .SH USAGE EXAMPLES
 .TP
 .B ss -t -a
--- a/man/man8/tc-taprio.8
+++ b/man/man8/tc-taprio.8
@ -92,7 +92,11 @@ in the schedule;
 clockid
 .br
 Specifies the clock to be used by qdisc's internal timer for measuring
-time and scheduling events.
+time and scheduling events. This argument must be omitted when using the
+full-offload feature (flags 0x2), since in that case, the clockid is
+implicitly /dev/ptpN (where N is given by
+.B ethtool -T eth0 | grep 'PTP Hardware Clock'
+), and therefore not necessarily synchronized with the system's CLOCK_TAI.

 .TP
 sched-entry
@ -115,13 +119,27 @@ before moving to the next entry.
 .TP
 flags
 .br
-Specifies different modes for taprio. Currently, only txtime-assist is
-supported which can be enabled by setting it to 0x1. In this mode, taprio will
-set the transmit timestamp depending on the interval in which the packet needs
-to be transmitted. It will then utililize the
+This is a bit mask which specifies different modes for taprio.
+.RS
+.TP
+.I 0x1
+Enables the txtime-assist feature. In this mode, taprio will set the transmit
+timestamp depending on the interval in which the packet needs to be
+transmitted. It will then utilize the
 .BR etf(8)
 qdisc to sort and transmit the packets at the right time. The second example
 can be used as a reference to configure this mode.
+.TP
+.I 0x2
+Enables the full-offload feature. In this mode, taprio will pass the gate
+control list to the NIC which will execute it cyclically in hardware.
+When using full-offload, there is no need to specify the
+.B clockid
+argument.
+
+The txtime-assist and full-offload features are mutually exclusive, i.e.
+setting flags to 0x3 is invalid.
+.RE

 .TP
 txtime-delay
@ -178,5 +196,28 @@ for more information about configuring the ETF qdisc.
              offload delta 200000 clockid CLOCK_TAI
 .EE

+The following is a schedule in full offload mode. The
+.B base-time
+is 200 ns and the
+.B cycle-time
+is implicitly calculated as the sum of all
+.B sched-entry
+durations (i.e. 20 us + 20 us + 60 us = 100 us). Although the base-time is in
+the past, the hardware will start executing the schedule at a PTP time equal to
+the smallest integer multiple of 100 us, plus 200 ns, that is larger than the
+NIC's current PTP time.
+
+.EX
+# tc qdisc add dev eth0 parent root taprio \\
+              num_tc 8 \\
+              map 0 1 2 3 4 5 6 7 \\
+              queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \\
+              base-time 200 \\
+              sched-entry S 80 20000 \\
+              sched-entry S a0 20000 \\
+              sched-entry S df 60000 \\
+              flags 0x2
+.EE
+
 .SH AUTHORS
 Vinicius Costa Gomes <vinicius.gomes@intel.com>
--- a/misc/ss.c
+++ b/misc/ss.c
@ -3404,7 +3404,7 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f)
 	struct iovec iov[3];
 	int iovlen = 1;

-	if (protocol == IPPROTO_UDP)
+	if (protocol == IPPROTO_UDP || protocol == IPPROTO_MPTCP)
 		return -1;

 	if (protocol == IPPROTO_TCP)
@ -3623,6 +3623,14 @@ static int inet_show_netlink(struct filter *f, FILE *dump_fp, int protocol)
 	if (preferred_family == PF_INET6)
 		family = PF_INET6;

+	/* extended protocol will use INET_DIAG_REQ_PROTOCOL,
+	 * not supported by older kernels. On such kernels
+	 * rtnl_dump will bail with rtnl_dump_error().
+	 * Suppress the error to avoid confusing the user.
+	 */
+	if (protocol > 255)
+		rth.flags |= RTNL_HANDLE_F_SUPPRESS_NLERR;
+
 again:
 	if ((err = sockdiag_send(family, rth.fd, protocol, f)))
 		goto Exit;
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@ -85,6 +85,7 @@ struct rd_cmd {
 * Parser interface
 */
 bool rd_no_arg(struct rd *rd);
+bool rd_is_multiarg(struct rd *rd);
 void rd_arg_inc(struct rd *rd);

 char *rd_argv(struct rd *rd);
--- a/rdma/stat.c
+++ b/rdma/stat.c
@ -502,6 +502,12 @@ static int stat_get_arg(struct rd *rd, const char *arg)
 		return -EINVAL;

 	rd_arg_inc(rd);
+
+	if (rd_is_multiarg(rd)) {
+		pr_err("The parameter %s shouldn't include range\n", arg);
+		return -EINVAL;
+	}
+
 	value = strtol(rd_argv(rd), &endp, 10);
 	rd_arg_inc(rd);

@ -523,6 +529,8 @@ static int stat_one_qp_bind(struct rd *rd)
 		return ret;

 	lqpn = stat_get_arg(rd, "lqpn");
+	if (lqpn < 0)
+		return lqpn;

 	rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_SET,
 		       &seq, (NLM_F_REQUEST | NLM_F_ACK));
@ -537,6 +545,9 @@ static int stat_one_qp_bind(struct rd *rd)

 	if (rd_argc(rd)) {
 		cntn = stat_get_arg(rd, "cntn");
+		if (cntn < 0)
+			return cntn;
+
 		mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
 				 cntn);
 	}
@ -607,13 +618,23 @@ static int stat_one_qp_unbind(struct rd *rd)
 	unsigned int portid;
 	uint32_t seq;

+	if (rd_no_arg(rd)) {
+		stat_help(rd);
+		return -EINVAL;
+	}
+
 	ret = rd_build_filter(rd, stat_valid_filters);
 	if (ret)
 		return ret;

 	cntn = stat_get_arg(rd, "cntn");
+	if (cntn < 0)
+		return cntn;
+
 	if (rd_argc(rd)) {
 		lqpn = stat_get_arg(rd, "lqpn");
+		if (lqpn < 0)
+			return lqpn;
 		return do_stat_qp_unbind_lqpn(rd, cntn, lqpn);
 	}

--- a/rdma/utils.c
+++ b/rdma/utils.c
@ -47,6 +47,13 @@ bool rd_no_arg(struct rd *rd)
 	return rd_argc(rd) == 0;
 }

+bool rd_is_multiarg(struct rd *rd)
+{
+	if (!rd_argc(rd))
+		return false;
+	return strpbrk(rd_argv(rd), ",-") != NULL;
+}
+
 /*
 * Possible input:output
 * dev/port    | first port | is_dump_all
--- a/tc/m_gate.c
+++ b/tc/m_gate.c
@ -427,7 +427,7 @@ static int print_gate_list(struct rtattr *list)
 		__u32 index = 0, interval = 0;
 		__u8 gate_state = 0;
 		__s32 ipv = -1, maxoctets = -1;
-		char buf[22];
+		SPRINT_BUF(buf);

 		parse_rtattr_nested(tb, TCA_GATE_ENTRY_MAX, item);

@ -490,7 +490,7 @@ static int print_gate(struct action_util *au, FILE *f, struct rtattr *arg)
 	__s64 base_time = 0;
 	__s64 cycle_time = 0;
 	__s64 cycle_time_ext = 0;
-	char buf[22];
+	SPRINT_BUF(buf);
 	int prio = -1;

 	if (arg == NULL)
--- a/vdpa/.gitignore
+++ b/vdpa/.gitignore
@ -0,0 +1 @@
+vdpa