From 3d65cefbefc86a53877f1e6461a9461e5b8fd7b3 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Wed, 2 Jan 2019 11:57:00 +0800
Subject: [PATCH 01/19] iproute: Set ip/ip6 lwtunnel flags

ip l add dev tun type gretap external
ip r a 10.0.0.1 encap ip dst 192.168.152.171 id 1000 dev gretap

For example, with gretap, when the command sets the id but does not set
the TUNNEL_KEY flag, there is no key field in the sent packet.

User can set flags with key, csum, seq
ip r a 10.0.0.1 encap ip dst 192.168.152.171 id 1000 key csum dev gretap

Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/iproute_lwtunnel.c  | 58 +++++++++++++++++++++++++++++++++++++++++-
 man/man8/ip-route.8.in |  3 ++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c
index aee18ac5..03217b8f 100644
--- a/ip/iproute_lwtunnel.c
+++ b/ip/iproute_lwtunnel.c
@@ -31,7 +31,7 @@
 #include <linux/seg6_iptunnel.h>
 #include <linux/seg6_hmac.h>
 #include <linux/seg6_local.h>
-#include <net/if.h>
+#include <linux/if_tunnel.h>
 
 static const char *format_encap_type(int type)
 {
@@ -294,6 +294,7 @@ static void print_encap_mpls(FILE *fp, struct rtattr *encap)
 static void print_encap_ip(FILE *fp, struct rtattr *encap)
 {
 	struct rtattr *tb[LWTUNNEL_IP_MAX+1];
+	__u16 flags;
 
 	parse_rtattr_nested(tb, LWTUNNEL_IP_MAX, encap);
 
@@ -318,6 +319,16 @@ static void print_encap_ip(FILE *fp, struct rtattr *encap)
 	if (tb[LWTUNNEL_IP_TOS])
 		print_uint(PRINT_ANY, "tos",
 			   "tos %d ", rta_getattr_u8(tb[LWTUNNEL_IP_TOS]));
+
+	if (tb[LWTUNNEL_IP_FLAGS]) {
+		flags = rta_getattr_u16(tb[LWTUNNEL_IP_FLAGS]);
+		if (flags & TUNNEL_KEY)
+			print_bool(PRINT_ANY, "key", "key ", true);
+		if (flags & TUNNEL_CSUM)
+			print_bool(PRINT_ANY, "csum", "csum ", true);
+		if (flags & TUNNEL_SEQ)
+			print_bool(PRINT_ANY, "seq", "seq ", true);
+	}
 }
 
 static void print_encap_ila(FILE *fp, struct rtattr *encap)
@@ -354,6 +365,7 @@ static void print_encap_ila(FILE *fp, struct rtattr *encap)
 static void print_encap_ip6(FILE *fp, struct rtattr *encap)
 {
 	struct rtattr *tb[LWTUNNEL_IP6_MAX+1];
+	__u16 flags;
 
 	parse_rtattr_nested(tb, LWTUNNEL_IP6_MAX, encap);
 
@@ -379,6 +391,16 @@ static void print_encap_ip6(FILE *fp, struct rtattr *encap)
 	if (tb[LWTUNNEL_IP6_TC])
 		print_uint(PRINT_ANY, "tc",
 			   "tc %u ", rta_getattr_u8(tb[LWTUNNEL_IP6_TC]));
+
+	if (tb[LWTUNNEL_IP6_FLAGS]) {
+		flags = rta_getattr_u16(tb[LWTUNNEL_IP6_FLAGS]);
+		if (flags & TUNNEL_KEY)
+			print_bool(PRINT_ANY, "key", "key ", true);
+		if (flags & TUNNEL_CSUM)
+			print_bool(PRINT_ANY, "csum", "csum ", true);
+		if (flags & TUNNEL_SEQ)
+			print_bool(PRINT_ANY, "seq", "seq ", true);
+	}
 }
 
 static void print_encap_bpf(FILE *fp, struct rtattr *encap)
@@ -777,9 +799,11 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 			  int *argcp, char ***argvp)
 {
 	int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0;
+	int key_ok = 0, csum_ok = 0, seq_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
 	int ret = 0;
+	__u16 flags = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -827,6 +851,18 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
 			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
+		} else if (strcmp(*argv, "key") == 0) {
+			if (key_ok++)
+				duparg2("key", *argv);
+			flags |= TUNNEL_KEY;
+		} else if (strcmp(*argv, "csum") == 0) {
+			if (csum_ok++)
+				duparg2("csum", *argv);
+			flags |= TUNNEL_CSUM;
+		} else if (strcmp(*argv, "seq") == 0) {
+			if (seq_ok++)
+				duparg2("seq", *argv);
+			flags |= TUNNEL_SEQ;
 		} else {
 			break;
 		}
@@ -835,6 +871,9 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 		argc--; argv++;
 	}
 
+	if (flags)
+		ret = rta_addattr16(rta, len,  LWTUNNEL_IP_FLAGS, flags);
+
 	/* argv is currently the first unparsed argument,
 	 * but the lwt_parse_encap() caller will move to the next,
 	 * so step back
@@ -927,9 +966,11 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			   int *argcp, char ***argvp)
 {
 	int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0;
+	int key_ok = 0, csum_ok = 0, seq_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
 	int ret = 0;
+	__u16 flags = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -979,6 +1020,18 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				       *argv);
 			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT,
 					   hoplimit);
+		} else if (strcmp(*argv, "key") == 0) {
+			if (key_ok++)
+				duparg2("key", *argv);
+			flags |= TUNNEL_KEY;
+		} else if (strcmp(*argv, "csum") == 0) {
+			if (csum_ok++)
+				duparg2("csum", *argv);
+			flags |= TUNNEL_CSUM;
+		} else if (strcmp(*argv, "seq") == 0) {
+			if (seq_ok++)
+				duparg2("seq", *argv);
+			flags |= TUNNEL_SEQ;
 		} else {
 			break;
 		}
@@ -987,6 +1040,9 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 		argc--; argv++;
 	}
 
+	if (flags)
+		ret = rta_addattr16(rta, len,  LWTUNNEL_IP6_FLAGS, flags);
+
 	/* argv is currently the first unparsed argument,
 	 * but the lwt_parse_encap() caller will move to the next,
 	 * so step back
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index 9603ac6e..b9ae6e30 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in
@@ -737,7 +737,8 @@ is a set of encapsulation attributes specific to the
 .B tos
 .IR TOS " ] ["
 .B  ttl
-.IR TTL " ]"
+.IR TTL " ] [ "
+.BR key " ] [" csum " ] [ " seq " ] "
 .in -2
 .sp
 

From 28747146622a49c3e7b5c5b36dc02c6a64124770 Mon Sep 17 00:00:00 2001
From: Hans Dedecker <dedeckeh@gmail.com>
Date: Wed, 23 Jan 2019 22:02:31 +0100
Subject: [PATCH 02/19] f_flower: fix build with musl libc

XATTR_SIZE_MAX requires the usage of linux/limits.h; let's include it

Signed-off-by: Hans Dedecker <dedeckeh@gmail.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 tc/f_flower.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tc/f_flower.c b/tc/f_flower.c
index c5636667..9659e894 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -14,6 +14,7 @@
 #include <unistd.h>
 #include <string.h>
 #include <net/if.h>
+#include <linux/limits.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/ip.h>

From 17ed56fdf3bc2c5511bb9fa2f1e4487a3db721c0 Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Fri, 25 Jan 2019 10:37:07 +0000
Subject: [PATCH 03/19] libnetlink: linkdump_req: AF_PACKET family also expects
 ext_filter_mask

Without this fix, the VF info can't be shown using the command
"ip link".

146: ens1f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
    link/ether 24:8a:07:ad:78:52 brd ff:ff:ff:ff:ff:ff
    vf 0 MAC 02:25:d0:12:01:01, spoof checking off, link-state auto, trust off, query_rss off
    vf 1 MAC 02:25:d0:12:01:02, spoof checking off, link-state auto, trust off, query_rss off

Fixes: d97b16b2c906 ("libnetlink: linkdump_req: Only AF_UNSPEC family expects an ext_filter_mask")

Signed-off-by: Chris Mi <chrism@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/libnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 110f47bc..3beb4342 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -476,7 +476,7 @@ int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family,
 int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family,
 				req_filter_fn_t filter_fn)
 {
-	if (family == AF_UNSPEC) {
+	if (family == AF_UNSPEC || family == AF_PACKET) {
 		struct {
 			struct nlmsghdr nlh;
 			struct ifinfomsg ifm;

From 264be1d887102d47d725b299a1b74393259015dc Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 25 Jan 2019 17:09:17 +0000
Subject: [PATCH 04/19] bridge: fdb: Fix FDB dump with strict checking disabled

While iproute2 correctly uses ifinfomsg struct as the ancillary header
when requesting an FDB dump on old kernels, it sets the message type to
RTM_GETLINK. This results in a wrong reply being returned.

Fix this by using RTM_GETNEIGH instead.

Before:
$ bridge fdb show brport dummy0
Not RTM_NEWNEIGH: 00000158 00000010 00000002

After:
$ bridge fdb show brport dummy0
2a:0b:41:1c:92:d3 vlan 1 master br0 permanent
2a:0b:41:1c:92:d3 master br0 permanent
33:33:00:00:00:01 self permanent
01:00:5e:00:00:01 self permanent

Fixes: 05880354c2cf ("bridge: fdb: Fix filtering with strict checking disabled")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: LiLiang <liali@redhat.com>
Acked-by: David Ahern <dsahern@gmail.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 bridge/fdb.c         |  3 +--
 include/libnetlink.h |  3 +++
 lib/libnetlink.c     | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index f75e953a..c4bf4039 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -346,8 +346,7 @@ static int fdb_show(int argc, char **argv)
 	if (rth.flags & RTNL_HANDLE_F_STRICT_CHK)
 		rc = rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter);
 	else
-		rc = rtnl_linkdump_req_filter_fn(&rth, PF_BRIDGE,
-						 fdb_linkdump_filter);
+		rc = rtnl_fdb_linkdump_req_filter_fn(&rth, fdb_linkdump_filter);
 	if (rc < 0) {
 		perror("Cannot send dump request");
 		exit(1);
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 0854d6ad..503b3ec1 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -81,6 +81,9 @@ int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask)
 int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int fam,
 				req_filter_fn_t fn)
 	__attribute__((warn_unused_result));
+int rtnl_fdb_linkdump_req_filter_fn(struct rtnl_handle *rth,
+				    req_filter_fn_t filter_fn)
+	__attribute__((warn_unused_result));
 int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask)
 	__attribute__((warn_unused_result));
 int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req,
diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 3beb4342..1892a02a 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -503,6 +503,29 @@ int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family,
 	return __rtnl_linkdump_req(rth, family);
 }
 
+int rtnl_fdb_linkdump_req_filter_fn(struct rtnl_handle *rth,
+				    req_filter_fn_t filter_fn)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct ifinfomsg ifm;
+		char buf[128];
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+		.nlh.nlmsg_type = RTM_GETNEIGH,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.nlh.nlmsg_seq = rth->dump = ++rth->seq,
+		.ifm.ifi_family = PF_BRIDGE,
+	};
+	int err;
+
+	err = filter_fn(&req.nlh, sizeof(req));
+	if (err)
+		return err;
+
+	return send(rth->fd, &req, sizeof(req), 0);
+}
+
 int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask)
 {
 	struct {

From 3da6d055d93fefe40bf88a9bc37b4ce3433696ee Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 Jan 2019 16:41:07 -0800
Subject: [PATCH 05/19] bpf: add btf func and func_proto kind support

The issue is discovered for bpf selftest test_skb_cgroup.sh.
Currently we have,
  $ ./test_skb_cgroup_id.sh
  Wait for testing link-local IP to become available ... OK
  Object has unknown BTF type: 13!
  [PASS]

In the above the BTF type 13 refers to BTF kind
BTF_KIND_FUNC_PROTO.
This patch added support of BTF_KIND_FUNC_PROTO and
BTF_KIND_FUNC during type parsing.
With this patch, I got
  $ ./test_skb_cgroup_id.sh
  Wait for testing link-local IP to become available ... OK
  [PASS]

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/bpf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/bpf.c b/lib/bpf.c
index 5e85cfc0..762f8857 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -2193,12 +2193,16 @@ static int bpf_btf_prep_type_data(struct bpf_elf_ctx *ctx)
 		case BTF_KIND_ENUM:
 			type_cur += var_len * sizeof(struct btf_enum);
 			break;
+		case BTF_KIND_FUNC_PROTO:
+			type_cur += var_len * sizeof(struct btf_param);
+			break;
 		case BTF_KIND_TYPEDEF:
 		case BTF_KIND_PTR:
 		case BTF_KIND_FWD:
 		case BTF_KIND_VOLATILE:
 		case BTF_KIND_CONST:
 		case BTF_KIND_RESTRICT:
+		case BTF_KIND_FUNC:
 			break;
 		default:
 			fprintf(stderr, "Object has unknown BTF type: %u!\n", kind);

From 2d603d55a8160aa40f0a442574f1fc8dedc9a034 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 6 Feb 2019 10:41:58 -0800
Subject: [PATCH 06/19] tc: fix memory leak in error path

If value passed to parse_percent was not valid, it would
leak the dynamic allocation from sscanf.

Fixes: 927e3cfb52b5 ("tc: B.W limits can now be specified in %.")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 tc/tc_util.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index ab717890..1377b536 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -195,7 +195,7 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev)
 	long dev_mbit;
 	int ret;
 	double perc, rate_mbit;
-	char *str_perc;
+	char *str_perc = NULL;
 
 	if (!dev[0]) {
 		fprintf(stderr, "No device specified; specify device to rate limit by percentage\n");
@@ -230,6 +230,7 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev)
 	return 0;
 
 malf:
+	free(str_perc);
 	fprintf(stderr, "Specified rate value could not be read or is malformed\n");
 	return -1;
 }

From 817204d0b0ee98b0849902e5b20cc3e84460b900 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 6 Feb 2019 10:49:47 -0800
Subject: [PATCH 07/19] tc: avoid problems with hard coded rate string length

The parse_percent_rate function assumed the buffer was 20 characters.
Better to pass length in case the size ever changes.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 tc/tc_util.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index 1377b536..4e289ae9 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -190,7 +190,8 @@ static const struct rate_suffix {
 	{ NULL }
 };
 
-static int parse_percent_rate(char *rate, const char *str, const char *dev)
+static int parse_percent_rate(char *rate, size_t len,
+			      const char *str, const char *dev)
 {
 	long dev_mbit;
 	int ret;
@@ -221,8 +222,8 @@ static int parse_percent_rate(char *rate, const char *str, const char *dev)
 
 	rate_mbit = perc * dev_mbit;
 
-	ret = snprintf(rate, 20, "%lf", rate_mbit);
-	if (ret <= 0 || ret >= 20) {
+	ret = snprintf(rate, len, "%lf", rate_mbit);
+	if (ret <= 0 || ret >= len) {
 		fprintf(stderr, "Unable to parse calculated rate\n");
 		return -1;
 	}
@@ -239,7 +240,7 @@ int get_percent_rate(unsigned int *rate, const char *str, const char *dev)
 {
 	char r_str[20];
 
-	if (parse_percent_rate(r_str, str, dev))
+	if (parse_percent_rate(r_str, sizeof(r_str), str, dev))
 		return -1;
 
 	return get_rate(rate, r_str);
@@ -249,7 +250,7 @@ int get_percent_rate64(__u64 *rate, const char *str, const char *dev)
 {
 	char r_str[20];
 
-	if (parse_percent_rate(r_str, str, dev))
+	if (parse_percent_rate(r_str, sizeof(r_str), str, dev))
 		return -1;
 
 	return get_rate64(rate, r_str);

From 9e46c5c2063f2f8aa775d4fa17de3a82daeea47d Mon Sep 17 00:00:00 2001
From: Marcos Antonio Moraes <marcos.antonio@digirati.com.br>
Date: Thu, 7 Feb 2019 13:29:54 -0200
Subject: [PATCH 08/19] tc: use bits not mbits/sec in rate percent

As /sys/class/net/<iface>/speed indicates a value in Mbits/sec, the
conversion is necessary to create the correct limits.

This guarantees the same result for the following commands in an
1000Mbit/sec device:

tc class add ... htb rate 500Mbit
tc class add ... htb rate 50%

Fixes: 927e3cfb52b5 ("tc: B.W limits can now be specified in %.")
Signed-off-by: Marcos Antonio Moraes <marcos.antonio@digirati.com.br>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 tc/tc_util.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index 4e289ae9..07216fba 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -195,7 +195,7 @@ static int parse_percent_rate(char *rate, size_t len,
 {
 	long dev_mbit;
 	int ret;
-	double perc, rate_mbit;
+	double perc, rate_bit;
 	char *str_perc = NULL;
 
 	if (!dev[0]) {
@@ -220,9 +220,9 @@ static int parse_percent_rate(char *rate, size_t len,
 		return -1;
 	}
 
-	rate_mbit = perc * dev_mbit;
+	rate_bit = perc * dev_mbit * 1000 * 1000;
 
-	ret = snprintf(rate, len, "%lf", rate_mbit);
+	ret = snprintf(rate, len, "%lf", rate_bit);
 	if (ret <= 0 || ret >= len) {
 		fprintf(stderr, "Unable to parse calculated rate\n");
 		return -1;

From bb5ae621d0c7b9caf3a101903783bd5a1c997fa4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 Feb 2019 17:58:41 -0800
Subject: [PATCH 09/19] lib/libnetlink: ensure a minimum of 32KB for the buffer
 used in rtnl_recvmsg()

In the past, we tried to increase the buffer size up to 32 KB in order
to reduce number of syscalls per dump.

Commit 2d34851cd341 ("lib/libnetlink: re malloc buff if size is not enough")
brought the size back to 4KB because the kernel can not know the application
is ready to receive bigger requests.

See kernel commits 9063e21fb026 ("netlink: autosize skb lengthes") and
d35c99ff77ec ("netlink: do not enter direct reclaim from netlink_dump()")
for more details.

Fixes: 2d34851cd341 ("lib/libnetlink: re malloc buff if size is not enough")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hangbin Liu <liuhangbin@gmail.com>
Cc: Phil Sutter <phil@nwl.cc>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/libnetlink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 1892a02a..0d48a3d4 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -718,6 +718,8 @@ static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
 	if (len < 0)
 		return len;
 
+	if (len < 32768)
+		len = 32768;
 	buf = malloc(len);
 	if (!buf) {
 		fprintf(stderr, "malloc error: not enough buffer\n");

From 0f3f0ca3a2aef77b0e4009a8de31cb48f58993fc Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Wed, 13 Feb 2019 15:39:01 +0300
Subject: [PATCH 10/19] ss: add option --tos for requesting ipv4 tos and ipv6
 tclass

Also show socket class_id/priority used by classful qdisc.
The kernel reports this together with tclass since commit
("inet_diag: fix reporting cgroup classid and fallback to priority")

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 man/man8/ss.8 | 17 +++++++++++++++++
 misc/ss.c     | 27 +++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/man/man8/ss.8 b/man/man8/ss.8
index 553a6cf4..9f21202d 100644
--- a/man/man8/ss.8
+++ b/man/man8/ss.8
@@ -244,6 +244,23 @@ the pacing rate and max pacing rate
 a helper variable for TCP internal auto tuning socket receive buffer
 .RE
 .TP
+.B \-\-tos
+Show ToS and priority information. Below fields may appear:
+.RS
+.P
+.TP
+.B tos
+IPv4 Type-of-Service byte
+.P
+.TP
+.B tclass
+IPv6 Traffic Class byte
+.P
+.TP
+.B class_id
+Class id set by net_cls cgroup. If class is zero this shows priority set by SO_PRIORITY.
+.RE
+.TP
 .B \-K, \-\-kill
 Attempts to forcibly close sockets. This option displays sockets that are
 successfully closed and silently skips sockets that the kernel does not support
diff --git a/misc/ss.c b/misc/ss.c
index 3589ebed..9e821faf 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -110,6 +110,7 @@ static int show_header = 1;
 static int follow_events;
 static int sctp_ino;
 static int show_tipcinfo;
+static int show_tos;
 
 enum col_id {
 	COL_NETID,
@@ -3008,6 +3009,15 @@ static int inet_show_sock(struct nlmsghdr *nlh,
 		}
 	}
 
+	if (show_tos) {
+		if (tb[INET_DIAG_TOS])
+			out(" tos:%#x", rta_getattr_u8(tb[INET_DIAG_TOS]));
+		if (tb[INET_DIAG_TCLASS])
+			out(" tclass:%#x", rta_getattr_u8(tb[INET_DIAG_TCLASS]));
+		if (tb[INET_DIAG_CLASS_ID])
+			out(" class_id:%#x", rta_getattr_u32(tb[INET_DIAG_CLASS_ID]));
+	}
+
 	if (show_mem || (show_tcpinfo && s->type != IPPROTO_UDP)) {
 		out("\n\t");
 		if (s->type == IPPROTO_SCTP)
@@ -3058,6 +3068,11 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f)
 		req.r.idiag_ext |= (1<<(INET_DIAG_CONG-1));
 	}
 
+	if (show_tos) {
+		req.r.idiag_ext |= (1<<(INET_DIAG_TOS-1));
+		req.r.idiag_ext |= (1<<(INET_DIAG_TCLASS-1));
+	}
+
 	iov[0] = (struct iovec){
 		.iov_base = &req,
 		.iov_len = sizeof(req)
@@ -3118,6 +3133,11 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f)
 		req.r.idiag_ext |= (1<<(INET_DIAG_CONG-1));
 	}
 
+	if (show_tos) {
+		req.r.idiag_ext |= (1<<(INET_DIAG_TOS-1));
+		req.r.idiag_ext |= (1<<(INET_DIAG_TCLASS-1));
+	}
+
 	iov[0] = (struct iovec){
 		.iov_base = &req,
 		.iov_len = sizeof(req)
@@ -4661,6 +4681,7 @@ static void _usage(FILE *dest)
 "   -i, --info          show internal TCP information\n"
 "       --tipcinfo      show internal tipc socket information\n"
 "   -s, --summary       show socket usage summary\n"
+"       --tos           show tos and priority information\n"
 "   -b, --bpf           show bpf filter socket information\n"
 "   -E, --events        continually display sockets as they are destroyed\n"
 "   -Z, --context       display process SELinux security contexts\n"
@@ -4765,6 +4786,8 @@ static int scan_state(const char *state)
 #define OPT_TIPCSOCK 257
 #define OPT_TIPCINFO 258
 
+#define OPT_TOS 259
+
 static const struct option long_opts[] = {
 	{ "numeric", 0, 0, 'n' },
 	{ "resolve", 0, 0, 'r' },
@@ -4800,6 +4823,7 @@ static const struct option long_opts[] = {
 	{ "contexts", 0, 0, 'z' },
 	{ "net", 1, 0, 'N' },
 	{ "tipcinfo", 0, 0, OPT_TIPCINFO},
+	{ "tos", 0, 0, OPT_TOS },
 	{ "kill", 0, 0, 'K' },
 	{ "no-header", 0, 0, 'H' },
 	{ 0 }
@@ -4977,6 +5001,9 @@ int main(int argc, char *argv[])
 		case OPT_TIPCINFO:
 			show_tipcinfo = 1;
 			break;
+		case OPT_TOS:
+			show_tos = 1;
+			break;
 		case 'K':
 			current_filter.kill = 1;
 			break;

From 619765fe14b640c9aa8eff73fa5bc32da3cbfe80 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Wed, 13 Feb 2019 15:40:30 +0100
Subject: [PATCH 11/19] iplink: document XDP subcommand to force the XDP mode.

When attaching an eBPF program to a device, ip link can force the XDP mode
by using the xdp{generic,drv,offload} keyword instead of just 'xdp'.
Document this behaviour also in the help output.

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Fixes: 14683814 ("bpf: add xdpdrv for requesting XDP driver mode")
Fixes: 1b5e8094 ("bpf: allow requesting XDP HW offload")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/iplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index b5519201..3a0cf459 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -98,7 +98,7 @@ void iplink_usage(void)
 		"				   [ trust { on | off} ] ]\n"
 		"				   [ node_guid { eui64 } ]\n"
 		"				   [ port_guid { eui64 } ]\n"
-		"			  [ xdp { off |\n"
+		"			  [ { xdp | xdpgeneric | xdpdrv | xdpoffload } { off |\n"
 		"				  object FILE [ section NAME ] [ verbose ] |\n"
 		"				  pinned FILE } ]\n"
 		"			  [ master DEVICE ][ vrf NAME ]\n"

From c2f9dc14c41f388764f7634d36c3d05e354f053a Mon Sep 17 00:00:00 2001
From: Luca Boccassi <bluca@debian.org>
Date: Thu, 14 Feb 2019 23:29:18 +0000
Subject: [PATCH 12/19] ip route: get: allow zero-length subnet mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A /0 subnet mask is theoretically valid, but ip route get doesn't allow
it:

$ ip route get 1.0.0.0/0
need at least a destination address

Change the check and remember whether we found an address or not, since
according to the documentation it's a mandatory parameter.

$ ip/ip route get 1.0.0.0/0
1.0.0.0 via 192.168.1.1 dev eth0 src 192.168.1.91 uid 1000
    cache

Reported-by: Clément Hertling <wxcafe@wxcafe.net>
Signed-off-by: Luca Boccassi <bluca@debian.org>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/iproute.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 5f58a3b3..cc02a3e1 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1932,6 +1932,7 @@ static int iproute_get(int argc, char **argv)
 	int fib_match = 0;
 	int from_ok = 0;
 	unsigned int mark = 0;
+	bool address_found = false;
 
 	iproute_reset_filter(0);
 	filter.cloned = 2;
@@ -2037,11 +2038,12 @@ static int iproute_get(int argc, char **argv)
 				addattr_l(&req.n, sizeof(req),
 					  RTA_DST, &addr.data, addr.bytelen);
 			req.r.rtm_dst_len = addr.bitlen;
+			address_found = true;
 		}
 		argc--; argv++;
 	}
 
-	if (req.r.rtm_dst_len == 0) {
+	if (!address_found) {
 		fprintf(stderr, "need at least a destination address\n");
 		return -1;
 	}

From f5f8e96953453c056872ac22a4a311e4931b3fb1 Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Tue, 19 Feb 2019 21:50:19 +0100
Subject: [PATCH 13/19] ip-rule: fix json key "to_tbl" for unspecific rule
 action

The key should not be called "to_tbl" because this is precisely the
case where the action is not FR_ACT_TO_TBL. Change it to "action".

    # ip rule add blackhole
    # ip -j rule | python -m json.tool
    ...
    {
        "priority": 0,
        "src": "all",
        "to_tbl": "blackhole"
    },

This is an API break of JSON output as it was added in v4.17.0.
Still change it as the API is relatively new and unstable.

Fixes: 0dd4ccc56c0e ("iprule: add json support")

Signed-off-by: Thomas Haller <thaller@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/iprule.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iprule.c b/ip/iprule.c
index 2f58d8c2..4e9437de 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c
@@ -459,7 +459,7 @@ int print_rule(struct nlmsghdr *n, void *arg)
 	} else if (frh->action == FR_ACT_NOP) {
 		print_null(PRINT_ANY, "nop", "nop", NULL);
 	} else if (frh->action != FR_ACT_TO_TBL) {
-		print_string(PRINT_ANY, "to_tbl", "%s",
+		print_string(PRINT_ANY, "action", "%s",
 			     rtnl_rtntype_n2a(frh->action, b1, sizeof(b1)));
 	}
 

From d7cf2416fc3a08b411beffb93a9e118f6593892d Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 21 Feb 2019 19:37:51 +0100
Subject: [PATCH 14/19] ip-address: Use correct max attribute value in
 print_vf_stats64()

IFLA_VF_MAX is larger than the highest valid index in vf array.

Fixes: a1b99717c7cd7 ("Add displaying VF traffic statistics")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/ipaddress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 2bc33f3a..76edf706 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -547,7 +547,7 @@ static void print_vf_stats64(FILE *fp, struct rtattr *vfstats)
 		return;
 	}
 
-	parse_rtattr_nested(vf, IFLA_VF_MAX, vfstats);
+	parse_rtattr_nested(vf, IFLA_VF_STATS_MAX, vfstats);
 
 	if (is_json_context()) {
 		open_json_object("stats");

From 0e7e1819453cc5bc5610c896d3cbc5a30b48b164 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 21 Feb 2019 11:55:56 +0100
Subject: [PATCH 15/19] devlink: relax dpipe table show dependency on resources

Dpipe table show command has a dependency on getting resources.
If resource get command is not supported by the driver, dpipe table
show fails. However, resource is only additional information
in dpipe table show output. So relax the dependency and let
the dpipe tables be shown even if resources get command fails.

Fixes: ead180274caf ("devlink: Add support for resource/dpipe relation")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 devlink/devlink.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 3651e90c..cced8d61 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -4351,7 +4351,8 @@ static int dpipe_table_show(struct dpipe_ctx *ctx, struct nlattr *nl)
 	size = mnl_attr_get_u32(nla_table[DEVLINK_ATTR_DPIPE_TABLE_SIZE]);
 	counters_enabled = !!mnl_attr_get_u8(nla_table[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
 
-	resource_valid = !!nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID];
+	resource_valid = nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID] &&
+			 ctx->resources;
 	if (resource_valid) {
 		table->resource_id = mnl_attr_get_u64(nla_table[DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID]);
 		table->resource_valid = true;
@@ -4467,12 +4468,9 @@ static int cmd_dpipe_table_show(struct dl *dl)
 	dl_opts_put(nlh, dl);
 	err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_resource_dump_cb,
 				  &resource_ctx);
-	if (err) {
-		pr_err("error get resources %s\n", strerror(resource_ctx.err));
-		goto err_resource_dump;
-	}
+	if (!err)
+		dpipe_ctx.resources = resource_ctx.resources;
 
-	dpipe_ctx.resources = resource_ctx.resources;
 	flags = NLM_F_REQUEST | NLM_F_ACK;
 	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_DPIPE_TABLE_GET, flags);
 	dl_opts_put(nlh, dl);
@@ -4485,8 +4483,6 @@ static int cmd_dpipe_table_show(struct dl *dl)
 	dpipe_ctx_fini(&dpipe_ctx);
 	return 0;
 
-err_resource_dump:
-	resource_ctx_fini(&resource_ctx);
 err_resource_ctx_init:
 err_headers_get:
 	dpipe_ctx_fini(&dpipe_ctx);

From 02723cf230bfedb0918ae8a119d20cf4fd65091b Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Wed, 20 Feb 2019 11:33:57 -0500
Subject: [PATCH 16/19] bridge: make mcast_flood description consistent

This patch simply changes the description of the mcast_flood flag
with "flood" instead of "be flooded with" to avoid confusion, and be
consistent with the description of the flooding flag, which "Controls
whether a given port will *flood* unicast traffic for which there is
no FDB entry."

At the same time, fix the documentation for the "flood" flag which
is incorrectly described as "flooding on" or "flooding off".

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 man/man8/bridge.8     | 4 ++--
 man/man8/ip-link.8.in | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/man/man8/bridge.8 b/man/man8/bridge.8
index 72210f62..13c46386 100644
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@@ -344,7 +344,7 @@ Controls whether a given port will sync MAC addresses learned on device port to
 bridge FDB.
 
 .TP
-.BR "flooding on " or " flooding off "
+.BR "flood on " or " flood off "
 Controls whether a given port will flood unicast traffic for which there is no FDB entry. By default this flag is on.
 
 .TP
@@ -361,7 +361,7 @@ switch.
 
 .TP
 .BR "mcast_flood on " or " mcast_flood off "
-Controls whether a given port will be flooded with multicast traffic for which there is no MDB entry. By default this flag is on.
+Controls whether a given port will flood multicast traffic for which there is no MDB entry. By default this flag is on.
 
 .TP
 .BR "neigh_suppress on " or " neigh_suppress off "
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 73d37c19..6f31453c 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -2183,7 +2183,7 @@ queries.
 option above.
 
 .BR mcast_flood " { " on " | " off " }"
-- controls whether a given port will be flooded with multicast traffic for which there is no MDB entry.
+- controls whether a given port will flood multicast traffic for which there is no MDB entry.
 
 .BI group_fwd_mask " MASK "
 - set the group forward mask. This is the bitmask that is applied to decide whether to forward incoming frames destined to link-local addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, ie the bridge does not forward any link-local frames coming on this port).

From 6f618a6a82a9b4cabec9aa5589e36efba339fd38 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 21 Feb 2019 14:24:07 -0800
Subject: [PATCH 17/19] uapi: update inet_diag.h

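Sync with upstream changes. Only comments are added: they note which
attributes cannot be requested through the 8-bit idiag_ext field of
struct inet_diag_req_v2, and how some of them are requested instead.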

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 include/uapi/linux/inet_diag.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index f98d82d4..f3bcd7ee 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -137,15 +137,21 @@ enum {
 	INET_DIAG_TCLASS,
 	INET_DIAG_SKMEMINFO,
 	INET_DIAG_SHUTDOWN,
-	INET_DIAG_DCTCPINFO,
-	INET_DIAG_PROTOCOL,  /* response attribute only */
+
+	/*
+	 * Next extenstions cannot be requested in struct inet_diag_req_v2:
+	 * its field idiag_ext has only 8 bits.
+	 */
+
+	INET_DIAG_DCTCPINFO,	/* request as INET_DIAG_VEGASINFO */
+	INET_DIAG_PROTOCOL,	/* response attribute only */
 	INET_DIAG_SKV6ONLY,
 	INET_DIAG_LOCALS,
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
-	INET_DIAG_MARK,
-	INET_DIAG_BBRINFO,
-	INET_DIAG_CLASS_ID,
+	INET_DIAG_MARK,		/* only with CAP_NET_ADMIN */
+	INET_DIAG_BBRINFO,	/* request as INET_DIAG_VEGASINFO */
+	INET_DIAG_CLASS_ID,	/* request as INET_DIAG_TCLASS */
 	INET_DIAG_MD5SIG,
 	__INET_DIAG_MAX,
 };

From 9700927a008a803ac119bdf816bdc1baa69d705c Mon Sep 17 00:00:00 2001
From: Thomas De Schampheleire <thomas.de_schampheleire@nokia.com>
Date: Wed, 20 Feb 2019 15:41:51 +0100
Subject: [PATCH 18/19] ss: fix compilation under glibc < 2.18

Commit c759116a0b2b6da8df9687b0a40ac69050132c77 introduced support for
AF_VSOCK. This define is only provided since glibc version 2.18, so
compilation fails when using older toolchains.

Provide the necessary definitions if needed.

Signed-off-by: Thomas De Schampheleire <thomas.de_schampheleire@nokia.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 misc/ss.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/misc/ss.c b/misc/ss.c
index 9e821faf..766fdc5f 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -51,6 +51,14 @@
 #include <linux/tipc_netlink.h>
 #include <linux/tipc_sockets_diag.h>
 
+/* AF_VSOCK/PF_VSOCK is only provided since glibc 2.18 */
+#ifndef PF_VSOCK
+#define PF_VSOCK 40
+#endif
+#ifndef AF_VSOCK
+#define AF_VSOCK PF_VSOCK
+#endif
+
 #define MAGIC_SEQ 123456
 #define BUF_CHUNK (1024 * 1024)
 #define LEN_ALIGN(x) (((x) + 1) & ~1)

From aa5bd6a252ce46ee6757458f08a071aabdae9264 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 14 Feb 2019 01:58:32 +0100
Subject: [PATCH 19/19] ss: Render buffer to output every time a number of
 chunks are allocated

Eric reported that, with 10 million sockets, ss -emoi (about 1000 bytes
output per socket) can easily lead to OOM (buffer would grow to 10GB of
memory).

Limit the maximum size of the buffer to five chunks, 1M each. Render and
flush buffers whenever we reach that.

This might leave consecutive rendered blocks slightly misaligned with each
other, with occasional loss of readability on lines occurring approximately
every 5k to 50k sockets. Something like (from ss -tu):

[...]
CLOSE-WAIT   32       0           192.168.1.50:35232           10.0.0.1:https
ESTAB        0        0           192.168.1.50:53820           10.0.0.1:https
ESTAB       0        0           192.168.1.50:46924            10.0.0.1:https
CLOSE-WAIT  32       0           192.168.1.50:35228            10.0.0.1:https
[...]

However, I don't actually expect any human user to scroll through that
amount of sockets, so readability should be preserved when it matters.

The bulk of the diffstat comes from moving field_next() around, as we now
call render() from it. Functionally, this is implemented by six lines of
code, most of them in field_next().

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: 691bd854bf4a ("ss: Buffer raw fields first, then render them as a table")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 misc/ss.c | 68 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 766fdc5f..e9033503 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -60,7 +60,8 @@
 #endif
 
 #define MAGIC_SEQ 123456
-#define BUF_CHUNK (1024 * 1024)
+#define BUF_CHUNK (1024 * 1024)	/* Buffer chunk allocation size */
+#define BUF_CHUNKS_MAX 5	/* Maximum number of allocated buffer chunks */
 #define LEN_ALIGN(x) (((x) + 1) & ~1)
 
 #define DIAG_REQUEST(_req, _r)						    \
@@ -184,6 +185,7 @@ static struct {
 	struct buf_token *cur;	/* Position of current token in chunk */
 	struct buf_chunk *head;	/* First chunk */
 	struct buf_chunk *tail;	/* Current chunk */
+	int chunks;		/* Number of allocated chunks */
 } buffer;
 
 static const char *TCP_PROTO = "tcp";
@@ -944,6 +946,8 @@ static struct buf_chunk *buf_chunk_new(void)
 
 	new->end = buffer.cur->data;
 
+	buffer.chunks++;
+
 	return new;
 }
 
@@ -1088,33 +1092,6 @@ static int field_is_last(struct column *f)
 	return f - columns == COL_MAX - 1;
 }
 
-static void field_next(void)
-{
-	field_flush(current_field);
-
-	if (field_is_last(current_field))
-		current_field = columns;
-	else
-		current_field++;
-}
-
-/* Walk through fields and flush them until we reach the desired one */
-static void field_set(enum col_id id)
-{
-	while (id != current_field - columns)
-		field_next();
-}
-
-/* Print header for all non-empty columns */
-static void print_header(void)
-{
-	while (!field_is_last(current_field)) {
-		if (!current_field->disabled)
-			out("%s", current_field->header);
-		field_next();
-	}
-}
-
 /* Get the next available token in the buffer starting from the current token */
 static struct buf_token *buf_token_next(struct buf_token *cur)
 {
@@ -1140,6 +1117,7 @@ static void buf_free_all(void)
 		free(tmp);
 	}
 	buffer.head = NULL;
+	buffer.chunks = 0;
 }
 
 /* Get current screen width, default to 80 columns if TIOCGWINSZ fails */
@@ -1302,6 +1280,40 @@ static void render(void)
 	current_field = columns;
 }
 
+/* Move to next field, and render buffer if we reached the maximum number of
+ * chunks, at the last field in a line.
+ */
+static void field_next(void)
+{
+	if (field_is_last(current_field) && buffer.chunks >= BUF_CHUNKS_MAX) {
+		render();
+		return;
+	}
+
+	field_flush(current_field);
+	if (field_is_last(current_field))
+		current_field = columns;
+	else
+		current_field++;
+}
+
+/* Walk through fields and flush them until we reach the desired one */
+static void field_set(enum col_id id)
+{
+	while (id != current_field - columns)
+		field_next();
+}
+
+/* Print header for all non-empty columns */
+static void print_header(void)
+{
+	while (!field_is_last(current_field)) {
+		if (!current_field->disabled)
+			out("%s", current_field->header);
+		field_next();
+	}
+}
+
 static void sock_state_print(struct sockstat *s)
 {
 	const char *sock_name;