From 38dd041bfe773e481ebf9c8250e49c665af2e215 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Wed, 29 Jan 2020 15:56:40 +0100 Subject: [PATCH 01/14] ip-xfrm: Fix help messages After commit 8589eb4efdf2a ("treewide: refactor help messages") help messages for xfrm state and policy are broken, printing the same protocol many times in the UPSPEC section: $ ip xfrm state help [...] UPSPEC := proto { { tcp | tcp | tcp | tcp } [ sport PORT ] [ dport PORT ] | { icmp | icmp | icmp } [ type NUMBER ] [ code NUMBER ] | gre [ key { DOTTED-QUAD | NUMBER } ] | PROTO } This happens because the strxf_proto function is non-reentrant and gets called multiple times in the same fprintf instruction. This commit fixes the issue by avoiding calls to strxf_proto() with a constant param, just hardcoding strings for protocol names. Fixes: 8589eb4efdf2a ("treewide: refactor help messages") Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- ip/xfrm_policy.c | 21 +++------------------ ip/xfrm_state.c | 24 +++--------------------- 2 files changed, 6 insertions(+), 39 deletions(-) diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c index 7c0233c1..d3c706d3 100644 --- a/ip/xfrm_policy.c +++ b/ip/xfrm_policy.c @@ -66,24 +66,9 @@ static void usage(void) "Usage: ip xfrm policy count\n" "Usage: ip xfrm policy set [ hthresh4 LBITS RBITS ] [ hthresh6 LBITS RBITS ]\n" "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n" - "UPSPEC := proto { { "); - fprintf(stderr, "%s | %s | %s | %s } ", - strxf_proto(IPPROTO_TCP), - strxf_proto(IPPROTO_UDP), - strxf_proto(IPPROTO_SCTP), - strxf_proto(IPPROTO_DCCP)); - fprintf(stderr, - "[ sport PORT ] [ dport PORT ] |\n" - " { %s | %s | %s } ", - strxf_proto(IPPROTO_ICMP), - strxf_proto(IPPROTO_ICMPV6), - strxf_proto(IPPROTO_MH)); - fprintf(stderr, - "[ type NUMBER ] [ code NUMBER ] |\n" - " %s", - strxf_proto(IPPROTO_GRE)); - fprintf(stderr, - " [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" + "UPSPEC := proto { { tcp | udp | sctp | dccp } [ 
sport PORT ] [ dport PORT ] |\n" + " { icmp | ipv6-icmp | mobility-header } [ type NUMBER ] [ code NUMBER ] |\n" + " gre [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" "DIR := in | out | fwd\n" "PTYPE := main | sub\n" "ACTION := allow | block\n" diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c index df2d50c3..16ff1931 100644 --- a/ip/xfrm_state.c +++ b/ip/xfrm_state.c @@ -106,27 +106,9 @@ static void usage(void) "EXTRA-FLAG-LIST := [ EXTRA-FLAG-LIST ] EXTRA-FLAG\n" "EXTRA-FLAG := dont-encap-dscp\n" "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n" - "UPSPEC := proto { { "); - fprintf(stderr, - "%s | %s | %s | %s", - strxf_proto(IPPROTO_TCP), - strxf_proto(IPPROTO_UDP), - strxf_proto(IPPROTO_SCTP), - strxf_proto(IPPROTO_DCCP)); - fprintf(stderr, - " } [ sport PORT ] [ dport PORT ] |\n" - " { "); - fprintf(stderr, - "%s | %s | %s", - strxf_proto(IPPROTO_ICMP), - strxf_proto(IPPROTO_ICMPV6), - strxf_proto(IPPROTO_MH)); - fprintf(stderr, - " } [ type NUMBER ] [ code NUMBER ] |\n"); - fprintf(stderr, - " %s", strxf_proto(IPPROTO_GRE)); - fprintf(stderr, - " [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" + "UPSPEC := proto { { tcp | udp | sctp | dccp } [ sport PORT ] [ dport PORT ] |\n" + " { icmp | ipv6-icmp | mobility-header } [ type NUMBER ] [ code NUMBER ] |\n" + " gre [ key { DOTTED-QUAD | NUMBER } ] | PROTO }\n" "LIMIT-LIST := [ LIMIT-LIST ] limit LIMIT\n" "LIMIT := { time-soft | time-hard | time-use-soft | time-use-hard } SECONDS |\n" " { byte-soft | byte-hard } SIZE | { packet-soft | packet-hard } COUNT\n" From 5cdeb77cd6ec26f0a7103dfb21494a6a43903206 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Wed, 29 Jan 2020 15:31:11 +0100 Subject: [PATCH 02/14] ip link: xstats: fix TX IGMP reports string This restores the string format we had before jsonification, adding a missing space between v2 and v3 in the TX IGMP reports string. 
Fixes: a9bc23a79227a ("ip: bridge: add xstats json support") Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- ip/iplink_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iplink_bridge.c b/ip/iplink_bridge.c index bbd6f3a8..3e81aa05 100644 --- a/ip/iplink_bridge.c +++ b/ip/iplink_bridge.c @@ -743,7 +743,7 @@ static void bridge_print_stats_attr(struct rtattr *attr, int ifindex) print_string(PRINT_FP, NULL, "%-16s ", ""); print_u64(PRINT_ANY, "tx_v1", "TX: v1 %llu ", mstats->igmp_v1reports[BR_MCAST_DIR_TX]); - print_u64(PRINT_ANY, "tx_v2", "v2 %llu", + print_u64(PRINT_ANY, "tx_v2", "v2 %llu ", mstats->igmp_v2reports[BR_MCAST_DIR_TX]); print_u64(PRINT_ANY, "tx_v3", "v3 %llu\n", mstats->igmp_v3reports[BR_MCAST_DIR_TX]); From 8f9f2b9cdfbd1c7988542d81db1db854d48f2b0d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 2 Feb 2020 04:20:58 -0800 Subject: [PATCH 03/14] devlink: fix warning from unchecked write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Warning seen on Ubuntu devlink.c: In function ‘cmd_dev_flash’: devlink.c:3071:3: warning: ignoring return value of ‘write’, declared with attribute warn_unused_result [-Wunused-result] 3071 | write(pipe_w, &err, sizeof(err)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 9b13cddfe268 ("devlink: implement flash status monitoring") Signed-off-by: Stephen Hemminger --- devlink/devlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 73ce9865..f9e58c1d 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -3066,11 +3066,13 @@ static int cmd_dev_flash(struct dl *dl) /* In child, just execute the flash and pass returned * value through pipe once it is done. 
*/ + int cc; + close(pipe_r); err = _mnlg_socket_send(dl->nlg, nlh); - write(pipe_w, &err, sizeof(err)); + cc = write(pipe_w, &err, sizeof(err)); close(pipe_w); - exit(0); + exit(cc != sizeof(err)); } close(pipe_w); From 39995691b51c448b4d58845efc32b65dafcc48d7 Mon Sep 17 00:00:00 2001 From: Peter Junos Date: Wed, 29 Jan 2020 23:33:48 +0100 Subject: [PATCH 04/14] ss: fix tests to reflect compact output This fixes broken tests in commit c4f5862994589 ("ss: use compact output for undetected screen width") It also escapes stars as grep is used and more bugs could sneak under the radar with the previous solution. Signed-off-by: Peter Junos Signed-off-by: Stephen Hemminger --- testsuite/tests/ss/ssfilter.t | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/testsuite/tests/ss/ssfilter.t b/testsuite/tests/ss/ssfilter.t index 3091054f..4c2315ca 100755 --- a/testsuite/tests/ss/ssfilter.t +++ b/testsuite/tests/ss/ssfilter.t @@ -12,37 +12,37 @@ export TCPDIAG_FILE="$(dirname $0)/ss1.dump" ts_log "[Testing ssfilter]" ts_ss "$0" "Match dport = 22" -Htna dport = 22 -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match dport 22" -Htna dport 22 -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match (dport)" -Htna '( dport = 22 )' -test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" +test_on "ESTAB 0 0 10.0.0.1:36266 10.0.0.1:22" ts_ss "$0" "Match src = 0.0.0.0" -Htna src = 0.0.0.0 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src 0.0.0.0" -Htna src 0.0.0.0 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src sport" -Htna src 0.0.0.0 sport = 22 -test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src and sport" -Htna src 0.0.0.0 and sport = 22 -test_on 
"LISTEN 0 128 0.0.0.0:22 0.0.0.0:*" +test_on "LISTEN 0 128 0.0.0.0:22 0.0.0.0:\*" ts_ss "$0" "Match src and sport and dport" -Htna src 10.0.0.1 and sport = 22 and dport = 50312 -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match src and sport and (dport)" -Htna 'src 10.0.0.1 and sport = 22 and ( dport = 50312 )' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match src and (sport and dport)" -Htna 'src 10.0.0.1 and ( sport = 22 and dport = 50312 )' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match (src and sport) and dport" -Htna '( src 10.0.0.1 and sport = 22 ) and dport = 50312' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" ts_ss "$0" "Match (src or src) and dst" -Htna '( src 0.0.0.0 or src 10.0.0.1 ) and dst 10.0.0.2' -test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" +test_on "ESTAB 0 0 10.0.0.1:22 10.0.0.2:50312" From 9dced637f8dd269e0a409eb2eec1404d54d517f4 Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Tue, 4 Feb 2020 16:19:19 +0530 Subject: [PATCH 05/14] tc: add support for FQ-PIE packet scheduler This patch adds support for the FQ-PIE packet Scheduler Principles: - Packets are classified on flows. - This is a Stochastic model (as we use a hash, several flows might be hashed to the same slot) - Each flow has a PIE managed queue. - Flows are linked onto two (Round Robin) lists, so that new flows have priority on old ones. - For a given flow, packets are not reordered. - Drops during enqueue only. - ECN capability is off by default. - ECN threshold (if ECN is enabled) is at 10% by default. - Uses timestamps to calculate queue delay by default. Usage: tc qdisc ... 
fq_pie [ limit PACKETS ] [ flows NUMBER ] [ target TIME ] [ tupdate TIME ] [ alpha NUMBER ] [ beta NUMBER ] [ quantum BYTES ] [ memory_limit BYTES ] [ ecn_prob PERCENTAGE ] [ [no]ecn ] [ [no]bytemode ] [ [no_]dq_rate_estimator ] defaults: limit: 10240 packets, flows: 1024 target: 15 ms, tupdate: 15 ms (in jiffies) alpha: 1/8, beta : 5/4 quantum: device MTU, memory_limit: 32 Mb ecnprob: 10%, ecn: off bytemode: off, dq_rate_estimator: off Signed-off-by: Mohit P. Tahiliani Signed-off-by: Sachin D. Patil Signed-off-by: V. Saicharan Signed-off-by: Mohit Bhasi Signed-off-by: Leslie Monis Signed-off-by: Gautam Ramakrishnan Signed-off-by: Stephen Hemminger --- bash-completion/tc | 12 +- man/man8/tc-fq_pie.8 | 166 ++++++++++++++++++++++ man/man8/tc.8 | 8 ++ tc/Makefile | 1 + tc/q_fq_pie.c | 318 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 503 insertions(+), 2 deletions(-) create mode 100644 man/man8/tc-fq_pie.8 create mode 100644 tc/q_fq_pie.c diff --git a/bash-completion/tc b/bash-completion/tc index fe0d51ec..086cb7f6 100644 --- a/bash-completion/tc +++ b/bash-completion/tc @@ -3,8 +3,8 @@ # Copyright 2016 Quentin Monnet QDISC_KIND=' choke codel bfifo pfifo pfifo_head_drop fq fq_codel gred hhf \ - mqprio multiq netem pfifo_fast pie red rr sfb sfq tbf atm cbq drr \ - dsmark hfsc htb prio qfq ' + mqprio multiq netem pfifo_fast pie fq_pie red rr sfb sfq tbf atm \ + cbq drr dsmark hfsc htb prio qfq ' FILTER_KIND=' basic bpf cgroup flow flower fw route rsvp tcindex u32 matchall ' ACTION_KIND=' gact mirred bpf sample ' @@ -326,6 +326,14 @@ _tc_qdisc_options() _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator' return 0 ;; + fq_pie) + _tc_once_attr 'limit flows target tupdate \ + alpha beta quantum memory_limit ecn_prob' + _tc_one_of_list 'ecn noecn' + _tc_one_of_list 'bytemode nobytemode' + _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator' + return 0 + ;; red) _tc_once_attr 'limit min max avpkt burst adaptive probability \ bandwidth ecn harddrop' 
diff --git a/man/man8/tc-fq_pie.8 b/man/man8/tc-fq_pie.8 new file mode 100644 index 00000000..457a56bb --- /dev/null +++ b/man/man8/tc-fq_pie.8 @@ -0,0 +1,166 @@ +.TH FQ-PIE 8 "23 January 2020" "iproute2" "Linux" + +.SH NAME + +FQ-PIE - Flow Queue Proportional Integral controller Enhanced + +.SH SYNOPSIS + +.B tc qdisc ... fq_pie +[ \fBlimit\fR PACKETS ] [ \fBflows\fR NUMBER ] +.br + \ +[ \fBtarget\fR TIME ] [ \fBtupdate\fR TIME ] +.br + \ +[ \fBalpha\fR NUMBER ] [ \fBbeta\fR NUMBER ] +.br + \ +[ \fBquantum\fR BYTES ] [ \fBmemory_limit\fR BYTES ] +.br + \ +[ \fBecn_prob\fR PERCENTAGE ] [ [\fBno\fR]\fBecn\fR ] +.br + \ +[ [\fBno\fR]\fBbytemode\fR ] [ [\fBno_\fR]\fBdq_rate_estimator\fR ] + +.SH DESCRIPTION +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. + +.SH ALGORITHM +The FQ-PIE algorithm consists of two logical parts: the scheduler which selects +which queue to dequeue a packet from, and the PIE AQM which works on each of the +queues. The major work of FQ-PIE is mostly in the scheduling part. The +interaction between the scheduler and the PIE algorithm is straightforward. + +During the enqueue stage, a hashing-based scheme is used, where flows are hashed +into a number of buckets with each bucket having its own queue. The number of +buckets is configurable, and presently defaults to 1024 in the implementation. +The flow hashing is performed on the 5-tuple of source and destination IP +addresses, port numbers and IP protocol number. Once the packet has been +successfully classified into a queue, it is handed over to the PIE algorithm +for enqueuing. 
It is then added to the tail of the selected queue, and the +queue's byte count is updated by the packet size. If the queue is not currently +active (i.e., if it is not in either the list of new or the list of old queues), +it is added to the end of the list of new queues, and its number of credits +is initialized to the configured quantum. Otherwise, the queue is left in its +current queue list. + +During the dequeue stage, the scheduler first looks at the list of new queues; +for the queue at the head of that list, if that queue has a negative number of +credits (i.e., it has already dequeued at least a quantum of bytes), it is given +an additional quantum of credits, the queue is put onto the end of the list of +old queues, and the routine selects the next queue and starts again. Otherwise, +that queue is selected for dequeue again. If the list of new queues is empty, +the scheduler proceeds down the list of old queues in the same fashion +(checking the credits, and either selecting the queue for dequeuing, or adding +credits and putting the queue back at the end of the list). After having +selected a queue from which to dequeue a packet, the PIE algorithm is invoked +on that queue. + +Finally, if the PIE algorithm does not return a packet, then the queue must be +empty and the scheduler does one of two things: + +If the queue selected for dequeue came from the list of new queues, it is moved +to the end of the list of old queues. If instead it came from the list of old +queues, that queue is removed from the list, to be added back (as a new queue) +the next time a packet arrives that hashes to that queue. Then (since no packet +was available for dequeue), the whole dequeue process is restarted from the +beginning. + +If, instead, the scheduler did get a packet back from the PIE algorithm, it +subtracts the size of the packet from the byte credits for the selected queue +and returns the packet as the result of the dequeue operation. 
+ +.SH PARAMETERS +.SS limit +It is the limit on the queue size in packets. Incoming packets are dropped when +the limit is reached. The default value is 10240 packets. + +.SS flows +It is the number of flows into which the incoming packets are classified. Due +to the stochastic nature of hashing, multiple flows may end up being hashed +into the same slot. Newer flows have priority over older ones. This +parameter can be set only at load time since memory has to be allocated for +the hash table. The default value is 1024. + +.SS target +It is the queue delay which the PIE algorithm tries to maintain. The default +target delay is 15ms. + +.SS tupdate +It is the time interval at which the system drop probability is calculated. +The default is 15ms. + +.SS alpha +.SS beta +alpha and beta are parameters chosen to control the drop probability. These +should be in the range between 0 and 32. + +.SS quantum +quantum signifies the number of bytes that may be dequeued from a queue before +switching to the next queue in the deficit round robin scheme. + +.SS memory_limit +It is the maximum total memory allowed for packets of all flows. The default is +32Mb. + +.SS ecn_prob +It is the drop probability threshold below which packets will be ECN marked +instead of getting dropped. The default is 10%. Setting this parameter requires +\fBecn\fR to be enabled. + +.SS \fR[\fBno\fR]\fBecn\fR +It has the same semantics as \fBpie\fR and can be used to mark packets +instead of dropping them. If \fBecn\fR has been enabled, \fBnoecn\fR can +be used to turn it off and vice versa. + +.SS \fR[\fBno\fR]\fBbytemode\fR +It is used to scale drop probability proportional to packet size. Use +\fBbytemode\fR to turn on bytemode, \fBnobytemode\fR to turn off +bytemode. By default, \fBbytemode\fR is turned off. 
+ +.SS \fR[\fBno_\fR]\fBdq_rate_estimator\fR +\fBdq_rate_estimator\fR can be used to calculate queue delay using Little's +Law, \fBno_dq_rate_estimator\fR can be used to calculate queue delay +using timestamps. By default, \fBdq_rate_estimator\fR is turned off. + +.SH EXAMPLES +# tc qdisc add dev eth0 root fq_pie +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 + Sent 159173586 bytes 105261 pkt (dropped 24, overlimits 0 requeues 0) + backlog 75700b 50p requeues 0 + pkts_in 105311 overlimit 0 overmemory 0 dropped 24 ecn_mark 0 + new_flow_count 7332 new_flows_len 0 old_flows_len 4 memory_used 108800 + +# tc qdisc add dev eth0 root fq_pie dq_rate_estimator +.br +# tc -s qdisc show dev eth0 +.br +qdisc fq_pie 8001: root refcnt 2 limit 10240p flows 1024 target 15.0ms tupdate +16.0ms alpha 2 beta 20 quantum 1514b memory_limit 32Mb ecn_prob 10 +dq_rate_estimator + Sent 8263620 bytes 5550 pkt (dropped 4, overlimits 0 requeues 0) + backlog 805448b 532p requeues 0 + pkts_in 6082 overlimit 0 overmemory 0 dropped 4 ecn_mark 0 + new_flow_count 94 new_flows_len 0 old_flows_len 8 memory_used 1157632 + +.SH SEE ALSO +.BR tc (8), +.BR tc-pie (8), +.BR tc-fq_codel (8) + +.SH SOURCES +RFC 8033: https://tools.ietf.org/html/rfc8033 + +.SH AUTHORS +FQ-PIE was implemented by Mohit P. Tahiliani. Please report corrections to the +Linux Networking mailing list <netdev@vger.kernel.org>. diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 39976ad7..e8e0cd0f 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -284,6 +284,13 @@ bandwidth to all the flows using the queue. Each such flow is managed by the CoDel queuing discipline. Reordering within a flow is avoided since Codel internally uses a FIFO queue. .TP +fq_pie +FQ-PIE (Flow Queuing with Proportional Integral controller Enhanced) is a +queuing discipline that combines Flow Queuing with the PIE AQM scheme. 
FQ-PIE +uses a Jenkins hash function to classify incoming packets into different flows +and is used to provide a fair share of the bandwidth to all the flows using the +qdisc. Each such flow is managed by the PIE algorithm. +.TP gred Generalized Random Early Detection combines multiple RED queues in order to achieve multiple drop priorities. This is required to realize Assured @@ -855,6 +862,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR tc-flower (8), .BR tc-fq (8), .BR tc-fq_codel (8), +.BR tc-fq_pie (8), .BR tc-fw (8), .BR tc-hfsc (7), .BR tc-hfsc (8), diff --git a/tc/Makefile b/tc/Makefile index f06ba14b..e31cbc12 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -70,6 +70,7 @@ TCMODULES += q_codel.o TCMODULES += q_fq_codel.o TCMODULES += q_fq.o TCMODULES += q_pie.o +TCMODULES += q_fq_pie.o TCMODULES += q_cake.o TCMODULES += q_hhf.o TCMODULES += q_clsact.o diff --git a/tc/q_fq_pie.c b/tc/q_fq_pie.c new file mode 100644 index 00000000..c136cd1a --- /dev/null +++ b/tc/q_fq_pie.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Flow Queue PIE + * + * Copyright (C) 2019 Mohit P. Tahiliani + * Copyright (C) 2019 Sachin D. Patil + * Copyright (C) 2019 V. Saicharan + * Copyright (C) 2019 Mohit Bhasi + * Copyright (C) 2019 Leslie Monis + * Copyright (C) 2019 Gautam Ramakrishnan + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, + "Usage: ... 
fq_pie [ limit PACKETS ] [ flows NUMBER ]\n" + " [ target TIME ] [ tupdate TIME ]\n" + " [ alpha NUMBER ] [ beta NUMBER ]\n" + " [ quantum BYTES ] [ memory_limit BYTES ]\n" + " [ ecn_prob PERCENTAGE ] [ [no]ecn ]\n" + " [ [no]bytemode ] [ [no_]dq_rate_estimator ]\n"); +} + +#define ALPHA_MAX 32 +#define BETA_MAX 32 + +static int fq_pie_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n, const char *dev) +{ + unsigned int limit = 0; + unsigned int flows = 0; + unsigned int target = 0; + unsigned int tupdate = 0; + unsigned int alpha = 0; + unsigned int beta = 0; + unsigned int quantum = 0; + unsigned int memory_limit = 0; + unsigned int ecn_prob = 0; + int ecn = -1; + int bytemode = -1; + int dq_rate_estimator = -1; + struct rtattr *tail; + + while (argc > 0) { + if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_unsigned(&limit, *argv, 0)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + } else if (strcmp(*argv, "flows") == 0) { + NEXT_ARG(); + if (get_unsigned(&flows, *argv, 0)) { + fprintf(stderr, "Illegal \"flows\"\n"); + return -1; + } + } else if (strcmp(*argv, "target") == 0) { + NEXT_ARG(); + if (get_time(&target, *argv)) { + fprintf(stderr, "Illegal \"target\"\n"); + return -1; + } + } else if (strcmp(*argv, "tupdate") == 0) { + NEXT_ARG(); + if (get_time(&tupdate, *argv)) { + fprintf(stderr, "Illegal \"tupdate\"\n"); + return -1; + } + } else if (strcmp(*argv, "alpha") == 0) { + NEXT_ARG(); + if (get_unsigned(&alpha, *argv, 0) || + alpha > ALPHA_MAX) { + fprintf(stderr, "Illegal \"alpha\"\n"); + return -1; + } + } else if (strcmp(*argv, "beta") == 0) { + NEXT_ARG(); + if (get_unsigned(&beta, *argv, 0) || + beta > BETA_MAX) { + fprintf(stderr, "Illegal \"beta\"\n"); + return -1; + } + } else if (strcmp(*argv, "quantum") == 0) { + NEXT_ARG(); + if (get_size(&quantum, *argv)) { + fprintf(stderr, "Illegal \"quantum\"\n"); + return -1; + } + } else if (strcmp(*argv, "memory_limit") == 0) { + NEXT_ARG(); + if 
(get_size(&memory_limit, *argv)) { + fprintf(stderr, "Illegal \"memory_limit\"\n"); + return -1; + } + } else if (strcmp(*argv, "ecn_prob") == 0) { + NEXT_ARG(); + if (get_unsigned(&ecn_prob, *argv, 0) || + ecn_prob >= 100) { + fprintf(stderr, "Illegal \"ecn_prob\"\n"); + return -1; + } + } else if (strcmp(*argv, "ecn") == 0) { + ecn = 1; + } else if (strcmp(*argv, "noecn") == 0) { + ecn = 0; + } else if (strcmp(*argv, "bytemode") == 0) { + bytemode = 1; + } else if (strcmp(*argv, "nobytemode") == 0) { + bytemode = 0; + } else if (strcmp(*argv, "dq_rate_estimator") == 0) { + dq_rate_estimator = 1; + } else if (strcmp(*argv, "no_dq_rate_estimator") == 0) { + dq_rate_estimator = 0; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + + argc--; + argv++; + } + + tail = addattr_nest(n, 1024, TCA_OPTIONS | NLA_F_NESTED); + if (limit) + addattr_l(n, 1024, TCA_FQ_PIE_LIMIT, &limit, sizeof(limit)); + if (flows) + addattr_l(n, 1024, TCA_FQ_PIE_FLOWS, &flows, sizeof(flows)); + if (target) + addattr_l(n, 1024, TCA_FQ_PIE_TARGET, &target, sizeof(target)); + if (tupdate) + addattr_l(n, 1024, TCA_FQ_PIE_TUPDATE, &tupdate, + sizeof(tupdate)); + if (alpha) + addattr_l(n, 1024, TCA_FQ_PIE_ALPHA, &alpha, sizeof(alpha)); + if (beta) + addattr_l(n, 1024, TCA_FQ_PIE_BETA, &beta, sizeof(beta)); + if (quantum) + addattr_l(n, 1024, TCA_FQ_PIE_QUANTUM, &quantum, + sizeof(quantum)); + if (memory_limit) + addattr_l(n, 1024, TCA_FQ_PIE_MEMORY_LIMIT, &memory_limit, + sizeof(memory_limit)); + if (ecn_prob) + addattr_l(n, 1024, TCA_FQ_PIE_ECN_PROB, &ecn_prob, + sizeof(ecn_prob)); + if (ecn != -1) + addattr_l(n, 1024, TCA_FQ_PIE_ECN, &ecn, sizeof(ecn)); + if (bytemode != -1) + addattr_l(n, 1024, TCA_FQ_PIE_BYTEMODE, &bytemode, + sizeof(bytemode)); + if (dq_rate_estimator != -1) + addattr_l(n, 1024, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, + &dq_rate_estimator, sizeof(dq_rate_estimator)); + 
addattr_nest_end(n, tail); + + return 0; +} + +static int fq_pie_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_FQ_PIE_MAX + 1]; + unsigned int limit = 0; + unsigned int flows = 0; + unsigned int target = 0; + unsigned int tupdate = 0; + unsigned int alpha = 0; + unsigned int beta = 0; + unsigned int quantum = 0; + unsigned int memory_limit = 0; + unsigned int ecn_prob = 0; + int ecn = -1; + int bytemode = -1; + int dq_rate_estimator = -1; + + SPRINT_BUF(b1); + + if (opt == NULL) + return 0; + + parse_rtattr_nested(tb, TCA_FQ_PIE_MAX, opt); + + if (tb[TCA_FQ_PIE_LIMIT] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_LIMIT]) >= sizeof(__u32)) { + limit = rta_getattr_u32(tb[TCA_FQ_PIE_LIMIT]); + print_uint(PRINT_ANY, "limit", "limit %up ", limit); + } + if (tb[TCA_FQ_PIE_FLOWS] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_FLOWS]) >= sizeof(__u32)) { + flows = rta_getattr_u32(tb[TCA_FQ_PIE_FLOWS]); + print_uint(PRINT_ANY, "flows", "flows %u ", flows); + } + if (tb[TCA_FQ_PIE_TARGET] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_TARGET]) >= sizeof(__u32)) { + target = rta_getattr_u32(tb[TCA_FQ_PIE_TARGET]); + print_uint(PRINT_JSON, "target", NULL, target); + print_string(PRINT_FP, NULL, "target %s ", + sprint_time(target, b1)); + } + if (tb[TCA_FQ_PIE_TUPDATE] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_TUPDATE]) >= sizeof(__u32)) { + tupdate = rta_getattr_u32(tb[TCA_FQ_PIE_TUPDATE]); + print_uint(PRINT_JSON, "tupdate", NULL, tupdate); + print_string(PRINT_FP, NULL, "tupdate %s ", + sprint_time(tupdate, b1)); + } + if (tb[TCA_FQ_PIE_ALPHA] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ALPHA]) >= sizeof(__u32)) { + alpha = rta_getattr_u32(tb[TCA_FQ_PIE_ALPHA]); + print_uint(PRINT_ANY, "alpha", "alpha %u ", alpha); + } + if (tb[TCA_FQ_PIE_BETA] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_BETA]) >= sizeof(__u32)) { + beta = rta_getattr_u32(tb[TCA_FQ_PIE_BETA]); + print_uint(PRINT_ANY, "beta", "beta %u ", beta); + } + if (tb[TCA_FQ_PIE_QUANTUM] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_QUANTUM]) >= sizeof(__u32)) { + quantum = 
rta_getattr_u32(tb[TCA_FQ_PIE_QUANTUM]); + print_uint(PRINT_JSON, "quantum", NULL, quantum); + print_string(PRINT_FP, NULL, "quantum %s ", + sprint_size(quantum, b1)); + } + if (tb[TCA_FQ_PIE_MEMORY_LIMIT] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_MEMORY_LIMIT]) >= sizeof(__u32)) { + memory_limit = rta_getattr_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + print_uint(PRINT_JSON, "memory_limit", NULL, memory_limit); + print_string(PRINT_FP, NULL, "memory_limit %s ", + sprint_size(memory_limit, b1)); + } + if (tb[TCA_FQ_PIE_ECN_PROB] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ECN_PROB]) >= sizeof(__u32)) { + ecn_prob = rta_getattr_u32(tb[TCA_FQ_PIE_ECN_PROB]); + print_uint(PRINT_ANY, "ecn_prob", "ecn_prob %u ", ecn_prob); + } + if (tb[TCA_FQ_PIE_ECN] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_ECN]) >= sizeof(__u32)) { + ecn = rta_getattr_u32(tb[TCA_FQ_PIE_ECN]); + if (ecn) + print_bool(PRINT_ANY, "ecn", "ecn ", true); + } + if (tb[TCA_FQ_PIE_BYTEMODE] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_BYTEMODE]) >= sizeof(__u32)) { + bytemode = rta_getattr_u32(tb[TCA_FQ_PIE_BYTEMODE]); + if (bytemode) + print_bool(PRINT_ANY, "bytemode", "bytemode ", true); + } + if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR] && + RTA_PAYLOAD(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) >= sizeof(__u32)) { + dq_rate_estimator = + rta_getattr_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + if (dq_rate_estimator) + print_bool(PRINT_ANY, "dq_rate_estimator", + "dq_rate_estimator ", true); + } + + return 0; +} + +static int fq_pie_print_xstats(struct qdisc_util *qu, FILE *f, + struct rtattr *xstats) +{ + struct tc_fq_pie_xstats _st = {}, *st; + + if (xstats == NULL) + return 0; + + st = RTA_DATA(xstats); + if (RTA_PAYLOAD(xstats) < sizeof(*st)) { + memcpy(&_st, st, RTA_PAYLOAD(xstats)); + st = &_st; + } + + print_uint(PRINT_ANY, "pkts_in", " pkts_in %u", + st->packets_in); + print_uint(PRINT_ANY, "overlimit", " overlimit %u", + st->overlimit); + print_uint(PRINT_ANY, "overmemory", " overmemory %u", + st->overmemory); + print_uint(PRINT_ANY, "dropped", " dropped %u", + 
st->dropped); + print_uint(PRINT_ANY, "ecn_mark", " ecn_mark %u", + st->ecn_mark); + print_nl(); + print_uint(PRINT_ANY, "new_flow_count", " new_flow_count %u", + st->new_flow_count); + print_uint(PRINT_ANY, "new_flows_len", " new_flows_len %u", + st->new_flows_len); + print_uint(PRINT_ANY, "old_flows_len", " old_flows_len %u", + st->old_flows_len); + print_uint(PRINT_ANY, "memory_used", " memory_used %u", + st->memory_usage); + + return 0; + +} + +struct qdisc_util fq_pie_qdisc_util = { + .id = "fq_pie", + .parse_qopt = fq_pie_parse_opt, + .print_qopt = fq_pie_print_opt, + .print_xstats = fq_pie_print_xstats, +}; From 0a6ea03be4fb363fe468e7e9863976e80e98a376 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 11 Feb 2020 08:16:42 -0800 Subject: [PATCH 06/14] uapi: update magic.h Signed-off-by: Stephen Hemminger --- include/uapi/linux/magic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 3ac43637..d7806400 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -87,6 +87,7 @@ #define NSFS_MAGIC 0x6e736673 #define BPF_FS_MAGIC 0xcafe4a11 #define AAFS_MAGIC 0x5a3c69f0 +#define ZONEFS_MAGIC 0x5a4f4653 /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 From 83c543af872e38654326671f6b504257d98c4489 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 18 Feb 2020 11:50:20 +0800 Subject: [PATCH 07/14] erspan: set erspan_ver to 1 by default Commit 289763626721 ("erspan: add erspan version II support") breaks the command: # ip link add erspan1 type erspan key 1 seq erspan 123 \ local 10.1.0.2 remote 10.1.0.1 as erspan_ver is set to 0 by default, then IFLA_GRE_ERSPAN_INDEX won't be set in gre_parse_opt(). # ip -d link show erspan1 ... erspan remote 10.1.0.1 local 10.1.0.2 ... erspan_index 0 erspan_ver 1 ^^^^^^^^^^^^^^ This patch is to change to set erspan_ver to 1 by default. 
Fixes: 289763626721 ("erspan: add erspan version II support") Signed-off-by: Xin Long Acked-by: William Tu Signed-off-by: Stephen Hemminger --- ip/link_gre.c | 2 +- ip/link_gre6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/link_gre.c b/ip/link_gre.c index 15beb737..e42f21ae 100644 --- a/ip/link_gre.c +++ b/ip/link_gre.c @@ -94,7 +94,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv, __u8 metadata = 0; __u32 fwmark = 0; __u32 erspan_idx = 0; - __u8 erspan_ver = 0; + __u8 erspan_ver = 1; __u8 erspan_dir = 0; __u16 erspan_hwid = 0; diff --git a/ip/link_gre6.c b/ip/link_gre6.c index 9d1741bf..94a4ee70 100644 --- a/ip/link_gre6.c +++ b/ip/link_gre6.c @@ -106,7 +106,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv, __u8 metadata = 0; __u32 fwmark = 0; __u32 erspan_idx = 0; - __u8 erspan_ver = 0; + __u8 erspan_ver = 1; __u8 erspan_dir = 0; __u16 erspan_hwid = 0; From 2c7056ac26412fe99443a283f0c1261cb81ccea2 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 17 Feb 2020 14:46:18 +0100 Subject: [PATCH 08/14] nstat: print useful error messages in abort() cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the nstat temporary file is corrupted or in some other corner cases, nstat uses abort() to stop its execution. This can puzzle some users, who are left wondering what the reason for the crash is. 
This commit replaces abort() with some meaningful error messages and exit() Reported-by: Renaud Métrich Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- misc/nstat.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/misc/nstat.c b/misc/nstat.c index 23113b22..425e75ef 100644 --- a/misc/nstat.c +++ b/misc/nstat.c @@ -142,14 +142,19 @@ static void load_good_table(FILE *fp) } /* idbuf is as big as buf, so this is safe */ nr = sscanf(buf, "%s%llu%lg", idbuf, &val, &rate); - if (nr < 2) - abort(); + if (nr < 2) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } if (nr < 3) rate = 0; if (useless_number(idbuf)) continue; - if ((n = malloc(sizeof(*n))) == NULL) - abort(); + if ((n = malloc(sizeof(*n))) == NULL) { + perror("nstat: malloc"); + exit(-1); + } n->id = strdup(idbuf); n->val = val; n->rate = rate; @@ -190,8 +195,11 @@ static void load_ugly_table(FILE *fp) int count1, count2, skip = 0; p = strchr(buf, ':'); - if (!p) - abort(); + if (!p) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } count1 = count_spaces(buf); *p = 0; idbuf[0] = 0; @@ -211,8 +219,10 @@ static void load_ugly_table(FILE *fp) strncat(idbuf, p, sizeof(idbuf) - off - 1); } n = malloc(sizeof(*n)); - if (!n) - abort(); + if (!n) { + perror("nstat: malloc"); + exit(-1); + } n->id = strdup(idbuf); n->rate = 0; n->next = db; @@ -221,18 +231,27 @@ static void load_ugly_table(FILE *fp) } n = db; nread = getline(&buf, &buflen, fp); - if (nread == -1) - abort(); + if (nread == -1) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } count2 = count_spaces(buf); if (count2 > count1) skip = count2 - count1; do { p = strrchr(buf, ' '); - if (!p) - abort(); + if (!p) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } *p = 0; - if (sscanf(p+1, 
"%llu", &n->val) != 1) - abort(); + if (sscanf(p+1, "%llu", &n->val) != 1) { + fprintf(stderr, "%s:%d: error parsing history file\n", + __FILE__, __LINE__); + exit(-2); + } /* Trick to skip "dummy" trailing ICMP MIB in 2.4 */ if (skip) skip--; From 320c5c6e0951f8ac7a06659d83abcec6c486df7d Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 25 Feb 2020 08:12:13 -0500 Subject: [PATCH 09/14] ip route: Do not imply pref and ttl-propagate are per nexthop Currently `ip -6 route show` gives us this output: sharpd@eva ~/i/ip (master)> ip -6 route show ::1 dev lo proto kernel metric 256 pref medium 4:5::6:7 nhid 18 proto static metric 20 nexthop via fe80::99 dev enp39s0 weight 1 nexthop via fe80::44 dev enp39s0 weight 1 pref medium Displaying `pref medium` as the last bit of output implies that the RTA_PREF is a per nexthop value, when it is infact a per route piece of data. Change the output to display RTA_PREF and RTA_TTL_PROPAGATE before the RTA_MULTIPATH data is shown: sharpd@eva ~/i/ip (master)> ./ip -6 route show ::1 dev lo proto kernel metric 256 pref medium 4:5::6:7 nhid 18 proto static metric 20 pref medium nexthop via fe80::99 dev enp39s0 weight 1 nexthop via fe80::44 dev enp39s0 weight 1 Signed-off-by: Donald Sharp Reviewed-by: Andrea Claudi Acked-by: Roopa Prabhu Signed-off-by: Stephen Hemminger --- ip/iproute.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index 93b805c9..07c45169 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -933,9 +933,6 @@ int print_route(struct nlmsghdr *n, void *arg) if (tb[RTA_IIF] && filter.iifmask != -1) print_rta_if(fp, tb[RTA_IIF], "iif"); - if (tb[RTA_MULTIPATH]) - print_rta_multipath(fp, r, tb[RTA_MULTIPATH]); - if (tb[RTA_PREF]) print_rt_pref(fp, rta_getattr_u8(tb[RTA_PREF])); @@ -951,6 +948,14 @@ int print_route(struct nlmsghdr *n, void *arg) propagate ? 
"enabled" : "disabled"); } + if (tb[RTA_MULTIPATH]) + print_rta_multipath(fp, r, tb[RTA_MULTIPATH]); + + /* If you are adding new route RTA_XXXX then place it above + * the RTA_MULTIPATH else it will appear that the last nexthop + * in the ECMP has new attributes + */ + print_string(PRINT_FP, NULL, "\n", NULL); close_json_object(); fflush(fp); From 229bb886a3c4444521eca16c7ab74a539aaf9cb4 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Thu, 27 Feb 2020 17:45:43 +0100 Subject: [PATCH 10/14] man: ip.8: Add missing vrf subcommand description Add description to the vrf subcommand and a reference to the dedicated man page. Signed-off-by: Andrea Claudi Reviewed-by: David Ahern Signed-off-by: Stephen Hemminger --- man/man8/ip.8 | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1661aa67..1613f790 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -22,7 +22,7 @@ ip \- show / manipulate routing, network devices, interfaces and tunnels .BR link " | " address " | " addrlabel " | " route " | " rule " | " neigh " | "\ ntable " | " tunnel " | " tuntap " | " maddress " | " mroute " | " mrule " | "\ monitor " | " xfrm " | " netns " | " l2tp " | " tcp_metrics " | " token " | "\ - macsec " }" + macsec " | " vrf " }" .sp .ti -8 @@ -312,6 +312,10 @@ readability. .B tuntap - manage TUN/TAP devices. +.TP +.B vrf +- manage virtual routing and forwarding devices. + .TP .B xfrm - manage IPSec policies. @@ -410,6 +414,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. 
.BR ip-tcp_metrics (8), .BR ip-token (8), .BR ip-tunnel (8), +.BR ip-vrf (8), .BR ip-xfrm (8) .br .RB "IP Command reference " ip-cref.ps From f9d696cf414c2c475764aa3b29cf288350f1e21f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 24 Feb 2020 09:57:01 -0500 Subject: [PATCH 11/14] xfrm: not try to delete ipcomp states when using deleteall In kernel space, ipcomp(sub) states used by main states are not allowed to be deleted by users, they would be freed only when all main states are destroyed and no one uses them. In user space, ip xfrm sta deleteall doesn't filter these ipcomp states out, and it causes errors: # ip xfrm state add src 192.168.0.1 dst 192.168.0.2 spi 0x1000 \ proto comp comp deflate mode tunnel sel src 192.168.0.1 dst \ 192.168.0.2 proto gre # ip xfrm sta deleteall Failed to send delete-all request : Operation not permitted This patch is to fix it by filtering ipcomp states with a check xsinfo->id.proto == IPPROTO_IPIP. Fixes: c7699875bee0 ("Import patch ipxfrm-20040707_2.diff") Signed-off-by: Xin Long Signed-off-by: Stephen Hemminger --- ip/xfrm_state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c index 16ff1931..d68f600a 100644 --- a/ip/xfrm_state.c +++ b/ip/xfrm_state.c @@ -1131,6 +1131,9 @@ static int xfrm_state_keep(struct nlmsghdr *n, void *arg) if (!xfrm_state_filter_match(xsinfo)) return 0; + if (xsinfo->id.proto == IPPROTO_IPIP) + return 0; + if (xb->offset > xb->size) { fprintf(stderr, "State buffer overflow\n"); return -1; From 8f1c9d4a3c0d4e720026b942c922372b3c12e110 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Fri, 28 Feb 2020 18:36:24 +0100 Subject: [PATCH 12/14] man: rdma.8: Add missing resource subcommand description Add resource subcommand in the OBJECT section and a short description for it. 
Reported-by: Zhaojuan Guo Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- man/man8/rdma.8 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/man/man8/rdma.8 b/man/man8/rdma.8 index ef29b1c6..221bf334 100644 --- a/man/man8/rdma.8 +++ b/man/man8/rdma.8 @@ -19,7 +19,7 @@ rdma \- RDMA tool .ti -8 .IR OBJECT " := { " -.BR dev " | " link " | " system " | " statistic " }" +.BR dev " | " link " | " resource " | " system " | " statistic " }" .sp .ti -8 @@ -70,6 +70,10 @@ Generate JSON output. .B link - RDMA port related. +.TP +.B resource +- RDMA resource configuration. + .TP .B sys - RDMA subsystem related. From 31824e2299bf5dc609026436db629b0c25cc1a10 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Fri, 28 Feb 2020 18:36:25 +0100 Subject: [PATCH 13/14] man: rdma-statistic: Add filter description Add description for filters on rdma statistics show command. Also add a filter description on the help message of the command. Additionally, fix some whitespace issue in the man page. 
Reported-by: Zhaojuan Guo Signed-off-by: Andrea Claudi Signed-off-by: Stephen Hemminger --- man/man8/rdma-statistic.8 | 16 ++++++++++++---- rdma/stat.c | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/man/man8/rdma-statistic.8 b/man/man8/rdma-statistic.8 index e3f4b51b..7de495c9 100644 --- a/man/man8/rdma-statistic.8 +++ b/man/man8/rdma-statistic.8 @@ -9,7 +9,7 @@ rdma-statistic \- RDMA statistic counter configuration .B rdma .RI "[ " OPTIONS " ]" .B statistic -.RI " { " COMMAND " | " +.RI "{ " COMMAND " | " .BR help " }" .sp @@ -23,6 +23,7 @@ rdma-statistic \- RDMA statistic counter configuration .RI "[ " OBJECT " ]" .B show link .RI "[ " DEV/PORT_INDX " ]" +.RI "[ " FILTER_NAME " " FILTER_VALUE " ]" .ti -8 .B rdma statistic @@ -34,7 +35,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B set .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .B auto .RI "{ " CRITERIA " | " .BR off " }" @@ -44,7 +45,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B bind .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .RI "[ " OBJECT-ID " ]" .RI "[ " COUNTER-ID " ]" @@ -53,7 +54,7 @@ rdma-statistic \- RDMA statistic counter configuration .IR OBJECT .B unbind .IR COUNTER_SCOPE -.RI "[ " DEV/PORT_INDEX "]" +.RI "[ " DEV/PORT_INDEX " ]" .RI "[ " COUNTER-ID " ]" .RI "[ " OBJECT-ID " ]" @@ -69,6 +70,10 @@ rdma-statistic \- RDMA statistic counter configuration .IR CRITERIA " := " .RB "{ " type " }" +.ti -8 +.IR FILTER_NAME " := " +.RB "{ " cntn " | " lqpn " | " pid " }" + .SH "DESCRIPTION" .SS rdma statistic [object] show - Queries the specified RDMA device for RDMA and driver-specific statistics. Show the default hw counters if object is not specified @@ -79,6 +84,9 @@ rdma-statistic \- RDMA statistic counter configuration .I "PORT_INDEX" - specifies counters on this RDMA port to show. +.I "FILTER_NAME" +- specifies a filter to show only the results matching it.
+ .SS rdma statistic set - configure counter statistic auto-mode for a specific device/port In auto mode all objects belong to one category are bind automatically to a single counter set. Not applicable for MR's. diff --git a/rdma/stat.c b/rdma/stat.c index 2f575287..8d4b7a11 100644 --- a/rdma/stat.c +++ b/rdma/stat.c @@ -23,6 +23,7 @@ static int stat_help(struct rd *rd) pr_out("where OBJECT: = { qp }\n"); pr_out(" CRITERIA : = { type }\n"); pr_out(" COUNTER_SCOPE: = { link | dev }\n"); + pr_out(" FILTER_NAME: = { cntn | lqpn | pid }\n"); pr_out("Examples:\n"); pr_out(" %s statistic qp show\n", rd->filename); pr_out(" %s statistic qp show link mlx5_2/1\n", rd->filename); From b5a77cf70116f4c5c1767f0e0ab78f7ff2f58bca Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 28 Feb 2020 13:55:38 -0800 Subject: [PATCH 14/14] uapi: update bpf.h Updated upstream Signed-off-by: Stephen Hemminger --- include/uapi/linux/bpf.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a8701bf..65764580 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. 
* - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also **bpf_redirect**\ (), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description