From 4e39bfb93a94098601699ea3f5de31c02cda1bf4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 23 Sep 2015 16:18:34 -0700 Subject: [PATCH 001/151] update kernel headers to 4.3 net-next --- include/linux/bpf.h | 9 +++++++++ include/linux/if_ether.h | 1 + include/linux/pkt_cls.h | 4 ++++ include/linux/rtnetlink.h | 2 ++ include/linux/tc_act/tc_skbedit.h | 3 ++- 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d6d00b7..ae08b735 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -272,6 +272,14 @@ enum bpf_func_id { BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_set_tunnel_key, BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ + /** + * bpf_redirect(ifindex, flags) - redirect to another netdev + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: TC_ACT_REDIRECT + */ + BPF_FUNC_redirect, __BPF_FUNC_MAX_ID, }; @@ -293,6 +301,7 @@ struct __sk_buff { __u32 tc_index; __u32 cb[5]; __u32 hash; + __u32 tc_classid; }; struct bpf_tunnel_key { diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 4678e499..bf278d65 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -42,6 +42,7 @@ #define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index 25af89fa..a323146e 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -33,6 +33,7 @@ enum { #define TC_ACT_STOLEN 4 #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 #define TC_ACT_JUMP 0x10000000 /* 
Action type identifiers*/ @@ -319,6 +320,8 @@ enum { /* BPF classifier */ +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + enum { TCA_BPF_UNSPEC, TCA_BPF_ACT, @@ -328,6 +331,7 @@ enum { TCA_BPF_OPS, TCA_BPF_FD, TCA_BPF_NAME, + TCA_BPF_FLAGS, __TCA_BPF_MAX, }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3fe10b05..10452e07 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -270,6 +270,7 @@ enum rt_scope_t { #define RTM_F_CLONED 0x200 /* This route is cloned */ #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ #define RTM_F_PREFIX 0x800 /* Prefix addresses */ +#define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ /* Reserved table identifiers */ @@ -664,6 +665,7 @@ struct tcamsg { #define RTEXT_FILTER_VF (1 << 0) #define RTEXT_FILTER_BRVLAN (1 << 1) #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) +#define RTEXT_FILTER_SKIP_STATS (1 << 3) /* End of information exported to user level */ diff --git a/include/linux/tc_act/tc_skbedit.h b/include/linux/tc_act/tc_skbedit.h index 73026b35..7a2e910a 100644 --- a/include/linux/tc_act/tc_skbedit.h +++ b/include/linux/tc_act/tc_skbedit.h @@ -11,7 +11,8 @@ * more details. * * You should have received a copy of the GNU General Public License along with - * this program; if not, see . + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Alexander Duyck */ From bc234301af1290a94659b2f63ea99277d565448c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 Sep 2015 11:19:48 -0700 Subject: [PATCH 002/151] ip route: Add RTM_F_LOOKUP_TABLE flag and show table id Currently 'ip route get' does not show the table the lookup result comes from and prior to kernel commit c36ba6603a11 the response from the kernel was hardcoded to the main table. 
From the discussion this appears to be a leftover from the route cache where the cached entry lost the table id and so the result was hardcoded to main table. c36ba6603a11 added the RTM_F_LOOKUP_TABLE flag to maintain that behavior but to allow new tools to ask for the actual table id for the lookup. This patch adds that flag to ip route get request and if the result is not the main table shows the table id. Signed-off-by: David Ahern --- ip/iproute.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index da25548c..b0cd299e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -424,9 +424,9 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) if (tb[RTA_OIF] && filter.oifmask != -1) fprintf(fp, "dev %s ", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_OIF]))); + if (table && (table != RT_TABLE_MAIN || show_details > 0) && !filter.tb) + fprintf(fp, " table %s ", rtnl_rttable_n2a(table, b1, sizeof(b1))); if (!(r->rtm_flags&RTM_F_CLONED)) { - if ((table != RT_TABLE_MAIN || show_details > 0) && !filter.tb) - fprintf(fp, " table %s ", rtnl_rttable_n2a(table, b1, sizeof(b1))); if ((r->rtm_protocol != RTPROT_BOOT || show_details > 0) && filter.protocolmask != -1) fprintf(fp, " proto %s ", rtnl_rtprot_n2a(r->rtm_protocol, b1, sizeof(b1))); if ((r->rtm_scope != RT_SCOPE_UNIVERSE || show_details > 0) && filter.scopemask != -1) @@ -1642,6 +1642,8 @@ static int iproute_get(int argc, char **argv) if (req.r.rtm_family == AF_UNSPEC) req.r.rtm_family = AF_INET; + req.r.rtm_flags |= RTM_F_LOOKUP_TABLE; + if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) exit(2); From dcd8d142d2540f29230420972c574dbeac0bf1e9 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Fri, 2 Oct 2015 10:15:21 +0200 Subject: [PATCH 003/151] tipc: add man pages This patch adds man pages for the TIPC tool. There is one main page and one page for each top level sub-command. These pages mainly aims to help a user of the tipc tool. 
In addition to this they describe a bit about what TIPC is and some of its features as a protocol. Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy --- man/man8/tipc-bearer.8 | 230 ++++++++++++++++++++++++++++++++++++++ man/man8/tipc-link.8 | 225 +++++++++++++++++++++++++++++++++++++ man/man8/tipc-media.8 | 86 ++++++++++++++ man/man8/tipc-nametable.8 | 99 ++++++++++++++++ man/man8/tipc-node.8 | 71 ++++++++++++ man/man8/tipc-socket.8 | 59 ++++++++++ man/man8/tipc.8 | 99 ++++++++++++++++ 7 files changed, 869 insertions(+) create mode 100644 man/man8/tipc-bearer.8 create mode 100644 man/man8/tipc-link.8 create mode 100644 man/man8/tipc-media.8 create mode 100644 man/man8/tipc-nametable.8 create mode 100644 man/man8/tipc-node.8 create mode 100644 man/man8/tipc-socket.8 create mode 100644 man/man8/tipc.8 diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 new file mode 100644 index 00000000..f59c39d2 --- /dev/null +++ b/man/man8/tipc-bearer.8 @@ -0,0 +1,230 @@ +.TH TIPC-BEARER 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. 
+./ For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-bearer \- show or modify TIPC bearers + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc bearer enable +.RB "[ " domain +.IR DOMAIN " ]" +.RB "[ " priority +.IR PRIORITY " ]" +.BR media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.B localip +.IR LOCALIP +.RB "[ " localport +.IR LOCALPORT " ]" +.RB "[ " remoteip +.IR REMOTEIP " ]" +.RB "[ " remoteport +.IR REMOTEPORT " ] }" +.br + +.ti -8 +.B tipc bearer disable media +.br +.RB "{ { " eth " | " ib " } " device +.IR DEVICE +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.B localip +.IR LOCALIP " } }" +.br + +.ti -8 +.B tipc bearer set +.RB "{ " "priority " +.IR PRIORITY +.RB "| " tolerance +.IR TOLERANCE +.RB "| " window +.IR WINDOW +.RB "} " media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.B localip +.IR LOCALIP " } }" +.br + +.ti -8 +.B tipc bearer get +.RB "{ " "priority" " | " tolerance " | " window " } " media +.br +.RB "{ { " eth " | " ib " } " device +.IR "DEVICE" " }" +.RB "|" +.br +.RB "{ " udp +.B name +.IR NAME +.B localip +.IR LOCALIP " } }" +.br + +.ti -8 +.B tipc bearer list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc bearer --help +will show bearer help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Bearer identification +.TP +.BI "media " MEDIA +.br +Specifies the TIPC media type for a particular bearer to operate on. +Different media types have different ways of identifying a unique bearer. 
+For example, +.BR "ib " "and " eth +identify a bearer with a +.I DEVICE +while +.B udp +identifies a bearer with a +.IR "LOCALIP " "and a " NAME + +.B ib +- Infiniband +.sp +.B eth +- Ethernet +.sp +.B udp +- User Datagram Protocol (UDP) +.sp + +.TP +.BI "name " NAME +.br +Logical bearer identifier valid for bearers on +.B udp +media. + +.TP +.BI "device " DEVICE +.br +Physical bearer device valid for bearers on +.B eth +and +.B ib +media. + +.SS Bearer properties + +.TP +.B domain +.br +The addressing domain (region) in which a bearer will establish links and accept +link establishment requests. + +.TP +.B priority +.br +Default link priority inherited by all links subsequently established over a +bearer. A single bearer can only host one link to a particular node. This means +the default link priority for a bearer typically affects which bearer to use +when communicating with a particular node in a multi-bearer setup. For more +info about link priority see +.BR tipc-link (8) + +.TP +.B tolerance +.br +Default link tolerance inherited by all links subsequently established over a +bearer. For more info about link tolerance see +.BR tipc-link (8) + +.TP +.B window +.br +Default link window inherited by all links subsequently established over a +bearer. For more info about the link window size see +.BR tipc-link (8) + +.SS UDP bearer options + +.TP +.BI "localip " LOCALIP +.br +Specify a local IP v4/v6 address for a +.B udp +bearer. + +.TP +.BI "localport " LOCALPORT +.br +Specify the local port for a +.B udp +bearer. The default port 6118 is used if no port is specified. + +.TP +.BI "remoteip " REMOTEIP +.br +Specify a remote IP for a +.B udp +bearer. If no remote IP is specified a +.B udp +bearer runs in multicast mode and tries to auto-discover its neighbours. +The multicast IP address is generated based on the TIPC network ID. If a remote +IP is specified the +.B udp +bearer runs in point-to-point mode. 
+ +.TP +.BI "remoteport " REMOTEPORT +.br +Specify the remote port for a +.B udp +bearer. The default port 6118 is used if no port is specified. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 new file mode 100644 index 00000000..899b8825 --- /dev/null +++ b/man/man8/tipc-link.8 @@ -0,0 +1,225 @@ +.TH TIPC-LINK 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. 
+If there are more than two links with the same priority the additional links will +be put in +.B STANDBY +state. + +.TP +.BR "STANDBY " "link state" +.br +A +.B STANDBY +link has lower link priority than an +.B ACTIVE +link. A +.B STANDBY +link has control traffic flowing and is ready to take over should the +.B ACTIVE +link(s) go down. + +.TP +.B MTU +.br +The Maximum Transmission Unit. The two endpoints advertise their default or +configured +.B MTU +at initial link setup and will agree to use the lower of the two values should +they differ. + +.TP +.B Packets +.br +The total number of transmitted or received TIPC packets on a link. Including +.BR "fragmented " "and " "bundled " packets. + +.TP +.B Fragments +.br +Represented in the form +.BR fragments / fragmented . +Where +.B fragmented +is the number of data messages which have been broken into +.BR fragments . +Subsequently the +.B fragments +are the total number of packets that the +.B fragmented +messages have been broken into. + +.TP +.B Bundles +.br +Represented in the form +.BR bundles / bundled . +If a link becomes congested the link will attempt to bundle data from small +.B bundled +packets into +.B bundles +of full MTU size packets before they are transmitted. + +.TP +.B Profile +.br +Shows the +.B average +packet size in octets/bytes for a +.B sample +of packets. It also shows the packet size distribution of the +.B sampled +packets in the intervals + +0-64 bytes +.br +64-256 bytes +.br +256-1024 bytes +.br +1024-4096 bytes +.br +4096-16384 bytes +.br +16384-32768 bytes +.br +32768-66000 bytes + +.TP +.B Message counters + +.B states +- Number of link state messages +.sp + +.B probes +- Link state messages with probe flag set. 
Typically sent when a link is idle +.sp + +.B nacks +- Number of negative acknowledgement (NACK) packets sent and received by the +link +.sp + +.B defs +- Number of packets received out of order +.sp + +.B dups +- Number of duplicate packets received + +.TP +.B Congestion link +The number of times an application has tried to send data when the TIPC link +was congested + +.TP +.B Send queue +.B Max +is the maximum amount of messages that has resided in the out queue during the +statistics collection period of a link. + +.B Avg +is the average outqueue size during the lifetime of a link. + +.SS Link properties + +.TP +.B priority +.br +The priority between logical TIPC links to a particular node. Link priority can +range from 0 (lowest) to 31 (highest). + +.TP +.B tolerance +.br +Link tolerance specifies the maximum time in milliseconds that TIPC will allow +a communication problem to exist before taking the link down. The default value +is 1500 milliseconds. + +.TP +.B window +.br +The link window controls how many unacknowledged messages a link endpoint can +have in its transmit queue before TIPC's congestion control mechanism is +activated. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-media (8), +.BR tipc-bearer (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 new file mode 100644 index 00000000..7f94efec --- /dev/null +++ b/man/man8/tipc-media.8 @@ -0,0 +1,86 @@ +.TH TIPC-MEDIA 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. 
+./ For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-media \- list or modify media properties + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 + +.ti -8 +.B tipc media set +.RB "{ " "priority " +.IR PRIORITY +.RB "| " tolerance +.IR TOLERANCE +.RB "| " window +.IR "WINDOW " } +.BI "media " MEDIA + +.ti -8 +.B tipc media get +.RB "{ " "priority" " | " tolerance " | " window " } " media +.I MEDIA + +.ti -8 +.B tipc media list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc media --help +will show media help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Media properties + +.TP +.B priority +.br +Default link priority inherited by all bearers subsequently enabled on a +media. For more info about link priority see +.BR tipc-link (8) + +.TP +.B tolerance +.br +Default link tolerance inherited by all bearers subsequently enabled on a +media. For more info about link tolerance see +.BR tipc-link (8) + +.TP +.B window +.br +Default link window inherited by all bearers subsequently enabled on a +media. For more info about link window see +.BR tipc-link (8) + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 new file mode 100644 index 00000000..c8d573f3 --- /dev/null +++ b/man/man8/tipc-nametable.8 @@ -0,0 +1,99 @@ +.TH TIPC-NAMETABLE 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. +./ For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-nametable \- show TIPC nametable + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc nametable show +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc nametable --help +will show nametable help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.SH DESCRIPTION +The nametable shows TIPC publication information. + +.SS Nametable format + +.TP +.B Type +.br +The 32-bit type field of the port name. The type field often indicates the class of service +provided by a port. + +.TP +.B Lower +.br +The lower bound of the 32-bit instance field of the port name. +The instance field is often used as as a sub-class indicator. + +.TP +.B Upper +.br +The upper bound of the 32-bit instance field of the port name. +The instance field is often used as as a sub-class indicator. +A difference in +.BR "lower " "and " upper +means the socket is bound to the port name range [lower,upper] + +.TP +.B Port Identity +.br +The unique socket (port) identifier within the TIPC cluster. The +.B port identity +consists of a node identity followed by a socket reference number. + +.TP +.B Publication +.br +The +.B publication +ID is a random number used internally to represent a publication. + +.TP +.B Scope +.br +The publication +.B scope +specifies the visibility of a bound port name. +The +.B scope +can be specified to comprise three different domains: +.BR node ", " "cluster " "and " zone. 
+Applications residing within the specified +.B scope +can see and access the port using the displayed port name. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 new file mode 100644 index 00000000..66418b35 --- /dev/null +++ b/man/man8/tipc-node.8 @@ -0,0 +1,71 @@ +.TH TIPC-NODE 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. +./ For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-node \- modify and show local node parameters or list peer nodes + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc node set +.RB "{ " "address " +.IR ADDRESS +.RB "| " netid +.IR NETID +.RB "} " + +.ti -8 +.B tipc node get +.RB "{ " "address" " | " netid " } " + +.ti -8 +.B tipc node list +.br + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc node --help +will show node help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Node parameters +.TP +.BI address +.br +The TIPC logical address. In the form x.y.z where x, y and z are unsigned +integers. + +.TP +.BI netid +.br +Network identity. Can be used to create individual TIPC clusters on the same +media. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. 
+ +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc-socket.8 b/man/man8/tipc-socket.8 new file mode 100644 index 00000000..af18e35b --- /dev/null +++ b/man/man8/tipc-socket.8 @@ -0,0 +1,59 @@ +.TH TIPC-SOCKET 8 "02 Jun 2015" "iproute2" "Linux" + +./ For consistency, please keep padding right aligned. +./ For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-socket \- show TIPC socket (port) information + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc socket list + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc socket --help +will show socket help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.SH DESCRIPTION +A TIPC socket is represented by an unsigned integer. + +.TP +.SS Bound state +A bound socket has a logical TIPC port name associated with it. + +.TP +.SS Connected state +A connected socket is directly connected to another socket creating a point +to point connection between TIPC sockets. If the connection to X was made using +a logical port name Y that name will show up as +.BR "connected to " "X " "via " Y +. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B <netdev@vger.kernel.org> +where the development and maintenance is primarily done. 
+You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc.8 b/man/man8/tipc.8 new file mode 100644 index 00000000..c1165523 --- /dev/null +++ b/man/man8/tipc.8 @@ -0,0 +1,99 @@ +.TH TIPC 8 "02 Jun 2015" "iproute2" "Linux" +.SH NAME +tipc \- a TIPC configuration and management tool +.SH SYNOPSIS + +.ad l +.in +8 +.ti -8 +.B tipc +.RI "[ " OPTIONS " ] " COMMAND " " ARGUMENTS " +.sp + +.ti -8 +.IR COMMAND " := { " +.BR bearer " | " link " | " media " | " nametable " | " node " | " socket " } +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-h\fR[\fIhelp\fR] } + +.SH DESCRIPTION +The Transparent Inter-Process Communication (TIPC) protocol offers total address +transparency between processes which allows applications in a clustered computer +environment to communicate quickly and reliably with each other, regardless of +their location within the cluster. + +TIPC originated at the telecommunications manufacturer Ericsson. The first open +source version of TIPC was created in 2000 when Ericsson released its first +Linux version of TIPC. TIPC was introduced in the mainline Linux kernel in 2006 +and is now widely used both within and outside of Ericsson. + +.SH OPTIONS + +.TP +.BR "\-h" , " --help" +Show help about last given command. For example +.B tipc bearer --help +will show bearer help and +.B tipc --help +will show general help. The position of the option in the string is irrelevant. + +.SH COMMANDS + +.TP +.B BEARER +- Show or modify TIPC bearers + +.TP +.B LINK +- Show or modify TIPC links + +.TP +.B MEDIA +- Show or modify TIPC media + +.TP +.B NAMETABLE +- Show TIPC nametable + +.TP +.B NODE +- Show or modify TIPC node parameters + +.TP +.B SOCKET +- Show TIPC sockets + +.SH ARGUMENTS + +Command arguments are described in a command specific man page and typically +consists of nested commands along with key value pairs. +If no arguments are given a command typically shows its help text. 
The explicit +help option +.B -h +or +.B --help +can occur anywhere among the arguments and will show help for the last valid +command given. + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe From 09a50f420bbf83546bfc685b2afa9435608eba1e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 7 Oct 2015 10:33:39 +0100 Subject: [PATCH 004/151] add tipc manpages to Makefile --- man/man8/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/man8/Makefile b/man/man8/Makefile index 9ffdaee3..1845987d 100644 --- a/man/man8/Makefile +++ b/man/man8/Makefile @@ -10,7 +10,9 @@ MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 rtpr.8 ss. ip-addrlabel.8 ip-fou.8 ip-gue.8 ip-l2tp.8 \ ip-maddress.8 ip-monitor.8 ip-mroute.8 ip-neighbour.8 \ ip-netns.8 ip-ntable.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8 \ - ip-tcp_metrics.8 ip-netconf.8 ip-token.8 + ip-tcp_metrics.8 ip-netconf.8 ip-token.8 \ + tipc.8 tipc-bearer.8 tipc-link.8 tipc-media.8 tipc-nametable.8 \ + tipc-node.8 tipc-socket.8 all: $(TARGETS) From 8aacb9bbbd741c2873ee90118264ae2d5bbd9ae3 Mon Sep 17 00:00:00 2001 From: Christoph Schulz Date: Fri, 25 Sep 2015 08:44:07 +0200 Subject: [PATCH 005/151] ip: allow using a device "help" (or a prefix thereof) Device names that match "help" or a prefix thereof should be allowed anywhere a device name can be used. Note that a suitable keyword ("dev" or "name", the latter for "ip tunnel") has to be used in these cases to resolve ambiguities. 
Signed-off-by: Christoph Schulz Reported-by: Leonhard Preis Reported-by: Wilhelm Wijkander --- ip/ip6tunnel.c | 2 +- ip/ipaddress.c | 2 +- ip/iplink.c | 3 +-- ip/ipmaddr.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 62a8240f..9884efd4 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -287,7 +287,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) if (strcmp(*argv, "name") == 0) { NEXT_ARG(); } - if (matches(*argv, "help") == 0) + else if (matches(*argv, "help") == 0) usage(); if (p->name[0]) duparg2("name", *argv); diff --git a/ip/ipaddress.c b/ip/ipaddress.c index e5398472..e864ca65 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1580,7 +1580,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); } - if (matches(*argv, "help") == 0) + else if (matches(*argv, "help") == 0) usage(); if (filter_dev) duparg2("dev", *argv); diff --git a/ip/iplink.c b/ip/iplink.c index 1c452055..f30de86d 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -1148,8 +1148,7 @@ static int do_set(int argc, char **argv) } else { if (strcmp(*argv, "dev") == 0) NEXT_ARG(); - - if (matches(*argv, "help") == 0) + else if (matches(*argv, "help") == 0) usage(); if (dev) diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index a77a18fb..cbd6d115 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -257,7 +257,7 @@ static int multiaddr_list(int argc, char **argv) if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); } - if (matches(*argv, "help") == 0) + else if (matches(*argv, "help") == 0) usage(); if (filter.dev) duparg2("dev", *argv); From 39e3d3836c1384506d0a76a496133c5361940770 Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 2 Oct 2015 11:59:37 +0200 Subject: [PATCH 006/151] batch: support quoted strings Support quoting strings with " or ' in an iproute2 batch file. 
Enables to configure empty crypto keys (for ESP-null) or keys with spaces: xfrm state add src 1.1.1.1 dst 2.2.2.2 proto ah spi 0x1 \ mode tunnel auth hmac(sha1) "r4ezR/@kd6'749f2 6zf$" xfrm state add src 5.5.5.5 dst 2.2.2.2 proto esp spi 0x2 \ mode tunnel enc cipher_null "" Signed-off-by: Christophe Gouault --- lib/utils.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/utils.c b/lib/utils.c index 29b4f548..107e3f57 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -914,12 +914,31 @@ int makeargs(char *line, char *argv[], int maxargs) char *cp; int argc = 0; - for (cp = strtok(line, ws); cp; cp = strtok(NULL, ws)) { + for (cp = line + strspn(line, ws); *cp; cp += strspn(cp, ws)) { if (argc >= (maxargs - 1)) { fprintf(stderr, "Too many arguments to command\n"); exit(1); } + + /* word begins with quote */ + if (*cp == '\'' || *cp == '"') { + char quote = *cp++; + + argv[argc++] = cp; + /* find ending quote */ + cp = strchr(cp, quote); + if (cp == NULL) { + fprintf(stderr, "Unterminated quoted string\n"); + exit(1); + } + *cp++ = 0; + continue; + } + argv[argc++] = cp; + /* find end of word */ + cp += strcspn(cp, ws); + *cp++ = 0; } argv[argc] = NULL; From 6b53cb66e8fba101e9982c46f93ee095a8e8e709 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 12 Oct 2015 09:22:29 -0700 Subject: [PATCH 007/151] update kernel headers --- include/linux/if_ether.h | 1 + include/linux/tc_act/tc_skbedit.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index 4678e499..bf278d65 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -42,6 +42,7 @@ #define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ #define ETH_P_PUP 0x0200 /* Xerox PUP packet */ #define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ #define ETH_P_IP 0x0800 /* Internet Protocol packet */ #define ETH_P_X25 0x0805 /* CCITT 
X.25 */ #define ETH_P_ARP 0x0806 /* Address Resolution packet */ diff --git a/include/linux/tc_act/tc_skbedit.h b/include/linux/tc_act/tc_skbedit.h index 73026b35..7a2e910a 100644 --- a/include/linux/tc_act/tc_skbedit.h +++ b/include/linux/tc_act/tc_skbedit.h @@ -11,7 +11,8 @@ * more details. * * You should have received a copy of the GNU General Public License along with - * this program; if not, see . + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Alexander Duyck */ From 9de8c6d9765f284f8f15ccbe4af791259afe707e Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Sun, 11 Oct 2015 14:03:03 -0700 Subject: [PATCH 008/151] bridge: add batch command support This patch adds support to batch bridge commands. Follows ip batch code. Signed-off-by: Wilson Kok Signed-off-by: Roopa Prabhu Acked-by: Christophe Gouault --- bridge/bridge.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ man/man8/bridge.8 | 11 +++++++++ 2 files changed, 70 insertions(+) diff --git a/bridge/bridge.c b/bridge/bridge.c index eaf09c84..72f153f2 100644 --- a/bridge/bridge.c +++ b/bridge/bridge.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "SNAPSHOT.h" #include "utils.h" @@ -23,6 +24,8 @@ int show_stats; int show_details; int compress_vlans; int timestamp; +char *batch_file; +int force; const char *_SL_; static void usage(void) __attribute__((noreturn)); @@ -31,6 +34,7 @@ static void usage(void) { fprintf(stderr, "Usage: bridge [ OPTIONS ] OBJECT { COMMAND | help }\n" +" bridge [ -force ] -batch filename\n" "where OBJECT := { link | fdb | mdb | vlan | monitor }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] |\n" " -o[neline] | -t[imestamp] | -n[etns] name |\n" @@ -71,6 +75,50 @@ static int do_cmd(const char *argv0, int argc, char **argv) return -1; } +static int batch(const char *name) +{ + char *line = NULL; + size_t len = 0; + int ret = EXIT_SUCCESS; + + if (name && 
strcmp(name, "-") != 0) { + if (freopen(name, "r", stdin) == NULL) { + fprintf(stderr, + "Cannot open file \"%s\" for reading: %s\n", + name, strerror(errno)); + return EXIT_FAILURE; + } + } + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + return EXIT_FAILURE; + } + + cmdlineno = 0; + while (getcmdline(&line, &len, stdin) != -1) { + char *largv[100]; + int largc; + + largc = makeargs(line, largv, 100); + if (largc == 0) + continue; /* blank line */ + + if (do_cmd(largv[0], largc, largv)) { + fprintf(stderr, "Command failed %s:%d\n", + name, cmdlineno); + ret = EXIT_FAILURE; + if (!force) + break; + } + } + if (line) + free(line); + + rtnl_close(&rth); + return ret; +} + int main(int argc, char **argv) { @@ -123,6 +171,14 @@ main(int argc, char **argv) exit(-1); } else if (matches(opt, "-compressvlans") == 0) { ++compress_vlans; + } else if (matches(opt, "-force") == 0) { + ++force; + } else if (matches(opt, "-batch") == 0) { + argc--; + argv++; + if (argc <= 1) + usage(); + batch_file = argv[1]; } else { fprintf(stderr, "Option \"%s\" is unknown, try \"bridge help\".\n", @@ -134,6 +190,9 @@ main(int argc, char **argv) _SL_ = oneline ? "\\" : "\n"; + if (batch_file) + return batch(batch_file); + if (rtnl_open(&rth, 0) < 0) exit(1); diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 5347a569..d45c7289 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -21,6 +21,7 @@ bridge \- show / manipulate bridge addresses and devices \fB\-V\fR[\fIersion\fR] | \fB\-s\fR[\fItatistics\fR] | \fB\-n\fR[\fIetns\fR] name } +\fB\-b\fR[\fIatch\fR] filename } .ti -8 .BR "bridge link set" @@ -137,6 +138,16 @@ to .RI "-n[etns] " NETNS " [ " OPTIONS " ] " OBJECT " { " COMMAND " | " .BR help " }" +.TP +.BR "\-b", " \-batch " +Read commands from provided file or standard input and invoke them. +First failure will cause termination of bridge command. + +.TP +.BR "\-force" +Don't terminate bridge command on errors in batch mode. 
+If there were any errors during execution of the commands, the application +return code will be non zero. .SH BRIDGE - COMMAND SYNTAX From 25bc3d3d4aac6d0ab20a01d922b8e8ad6eb2232e Mon Sep 17 00:00:00 2001 From: Satish Ashok Date: Mon, 12 Oct 2015 09:28:55 -0700 Subject: [PATCH 009/151] ip, bridge: document -timestamp option This patch documents bridge and ip -timestamp option Signed-off-by: Satish Ashok --- man/man8/bridge.8 | 3 +++ man/man8/ip.8 | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index d45c7289..222a4381 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -308,6 +308,9 @@ link setting is configured on specified physical device .BI master link setting is configured on the software bridge (default) +.TP +.BR "\-t" , " \-timestamp" +display current time when using monitor option. .SS bridge link show - list bridge port configuration. diff --git a/man/man8/ip.8 b/man/man8/ip.8 index e6c2b32a..1bdee118 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -175,6 +175,10 @@ executes specified command over all objects, it depends if command supports this .BR "\-c" , " -color" Use color output. +.TP +.BR "\-t" , " \-timestamp" +display current time when using monitor option. 
+ .SH IP - COMMAND SYNTAX .SS From 23e905096c45b5be4020a3adb5c8d0a789d30cc4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 12 Oct 2015 09:32:44 -0700 Subject: [PATCH 010/151] update kernel headers for net-next --- include/linux/bpf.h | 7 ++++++ include/linux/if_bridge.h | 1 + include/linux/if_link.h | 49 +++++++++++++++++++++++++++++++++++++++ include/linux/netlink.h | 1 + 4 files changed, 58 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae08b735..689be2e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,6 +280,13 @@ enum bpf_func_id { * Return: TC_ACT_REDIRECT */ BPF_FUNC_redirect, + + /** + * bpf_get_route_realm(skb) - retrieve a dst's tclassid + * @skb: pointer to skb + * Return: realm if != 0 + */ + BPF_FUNC_get_route_realm, __BPF_FUNC_MAX_ID, }; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index f24050ba..ee197a37 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -127,6 +127,7 @@ enum { #define BRIDGE_VLAN_INFO_UNTAGGED (1<<2) /* VLAN egresses untagged */ #define BRIDGE_VLAN_INFO_RANGE_BEGIN (1<<3) /* VLAN is start of vlan range */ #define BRIDGE_VLAN_INFO_RANGE_END (1<<4) /* VLAN is end of vlan range */ +#define BRIDGE_VLAN_INFO_BRENTRY (1<<5) /* Global bridge VLAN entry */ struct bridge_vlan_info { __u16 flags; diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 19345666..288d3cd6 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -230,11 +230,47 @@ enum { IFLA_BR_PRIORITY, IFLA_BR_VLAN_FILTERING, IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + 
IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, __IFLA_BR_MAX, }; #define IFLA_BR_MAX (__IFLA_BR_MAX - 1) +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, @@ -254,6 +290,19 @@ enum { IFLA_BRPORT_PROXYARP, /* proxy ARP */ IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 352b5b88..8a7ca5c6 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -54,6 +54,7 @@ struct nlmsghdr { #define NLM_F_ACK 4 /* Reply with ack, with zero or error code */ #define NLM_F_ECHO 8 /* Echo this request */ #define NLM_F_DUMP_INTR 16 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 32 /* Dump was filtered as requested */ /* Modifiers to GET request */ #define NLM_F_ROOT 0x100 /* specify tree root */ From 0d238ca2b8b79e31dbbae78875f4293e4f04de11 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Oct 2015 09:42:27 -0700 Subject: [PATCH 011/151] ip neigh: 
Add support for filtering dumps by master device Add support for filtering neighbor dumps by master device. Kernel side support provided by commit 21fdd092acc7. Since the feature is not available in older kernels the user is given a warning message if the kernel does not support the request. Signed-off-by: David Ahern --- include/libnetlink.h | 2 ++ ip/ipneigh.c | 35 +++++++++++++++++++++++++++++++---- lib/libnetlink.c | 21 +++++++++++++++++++++ 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index 0503dea5..48133591 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -42,6 +42,8 @@ int rtnl_wilddump_req_filter(struct rtnl_handle *rth, int fam, int type, int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len) __attribute__((warn_unused_result)); +int rtnl_dump_request_n(struct rtnl_handle *rth, struct nlmsghdr *n) + __attribute__((warn_unused_result)); struct rtnl_ctrl_data { int nsid; diff --git a/ip/ipneigh.c b/ip/ipneigh.c index a9e23f45..b8973a2d 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -39,6 +39,7 @@ static struct char *flushb; int flushp; int flushe; + int master; } filter; static void usage(void) __attribute__((noreturn)); @@ -193,6 +194,7 @@ int print_neigh(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) int len = n->nlmsg_len; struct rtattr * tb[NDA_MAX+1]; char abuf[256]; + static int logit = 1; if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH && n->nlmsg_type != RTM_GETNEIGH) { @@ -220,6 +222,14 @@ int print_neigh(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) (r->ndm_family != AF_DECnet)) return 0; + if (filter.master && !(n->nlmsg_flags & NLM_F_DUMP_FILTERED)) { + if (logit) { + logit = 0; + fprintf(fp, + "\nWARNING: Kernel does not support filtering by master device\n\n"); + } + } + parse_rtattr(tb, NDA_MAX, NDA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r))); if (tb[NDA_DST]) { @@ -327,9 +337,18 @@ void 
ipneigh_reset_filter(int ifindex) static int do_show_or_flush(int argc, char **argv, int flush) { + struct { + struct nlmsghdr n; + struct ndmsg ndm; + char buf[256]; + } req; char *filter_dev = NULL; int state_given = 0; - struct ndmsg ndm = { 0 }; + + memset(&req, 0, sizeof(req)); + + req.n.nlmsg_type = RTM_GETNEIGH; + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)); ipneigh_reset_filter(0); @@ -351,6 +370,14 @@ static int do_show_or_flush(int argc, char **argv, int flush) if (filter_dev) duparg("dev", *argv); filter_dev = *argv; + } else if (strcmp(*argv, "master") == 0) { + int ifindex; + NEXT_ARG(); + ifindex = ll_name_to_index(*argv); + if (!ifindex) + invarg("Device does not exist\n", *argv); + addattr32(&req.n, sizeof(req), NDA_MASTER, ifindex); + filter.master = ifindex; } else if (strcmp(*argv, "unused") == 0) { filter.unused_only = 1; } else if (strcmp(*argv, "nud") == 0) { @@ -371,7 +398,7 @@ static int do_show_or_flush(int argc, char **argv, int flush) state = 0x100; filter.state |= state; } else if (strcmp(*argv, "proxy") == 0) - ndm.ndm_flags = NTF_PROXY; + req.ndm.ndm_flags = NTF_PROXY; else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); @@ -436,9 +463,9 @@ static int do_show_or_flush(int argc, char **argv, int flush) return 1; } - ndm.ndm_family = filter.family; + req.ndm.ndm_family = filter.family; - if (rtnl_dump_request(&rth, RTM_GETNEIGH, &ndm, sizeof(struct ndmsg)) < 0) { + if (rtnl_dump_request_n(&rth, &req.n) < 0) { perror("Cannot send dump request"); exit(1); } diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 46cac34c..8e3762c1 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -191,6 +191,27 @@ int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len) return sendmsg(rth->fd, &msg, 0); } +int rtnl_dump_request_n(struct rtnl_handle *rth, struct nlmsghdr *n) +{ + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + struct iovec iov = { + .iov_base = (void*) n, + .iov_len = n->nlmsg_len + }; + struct 
msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + n->nlmsg_flags = NLM_F_DUMP|NLM_F_REQUEST; + n->nlmsg_pid = 0; + n->nlmsg_seq = rth->dump = ++rth->seq; + + return sendmsg(rth->fd, &msg, 0); +} + int rtnl_dump_filter_l(struct rtnl_handle *rth, const struct rtnl_dump_filter_arg *arg) { From faa8a463002fb9a365054dd333556e0aaa022759 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 25 Sep 2015 12:32:41 +0200 Subject: [PATCH 012/151] f_bpf: allow for optional classid and add flags When having optional classid, most minimal command can be sth like: tc filter add dev foo parent X: bpf obj prog.o Therefore, adapt the code so that a next argument will not be enforced as the case currently. Also, minor cleanup on the classid, where we should rather have used addattr32(), and add flags for exec configuration, for example (using short notation): tc filter add dev foo parent X: bpf da obj prog.o Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/utils.h | 1 + tc/f_bpf.c | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/include/utils.h b/include/utils.h index f77edeb0..668d159c 100644 --- a/include/utils.h +++ b/include/utils.h @@ -47,6 +47,7 @@ void incomplete_command(void) __attribute__((noreturn)); #define NEXT_ARG() do { argv++; if (--argc <= 0) incomplete_command(); } while(0) #define NEXT_ARG_OK() (argc - 1 > 0) +#define NEXT_ARG_FWD() do { argv++; argc--; } while(0) #define PREV_ARG() do { argv--; argc++; } while(0) typedef struct diff --git a/tc/f_bpf.c b/tc/f_bpf.c index 490dc6b4..ac77af58 100644 --- a/tc/f_bpf.c +++ b/tc/f_bpf.c @@ -41,7 +41,7 @@ static void explain(void) fprintf(stderr, "\n"); fprintf(stderr, "eBPF use case:\n"); fprintf(stderr, " object-file FILE [ section CLS_NAME ] [ export UDS_FILE ]"); - fprintf(stderr, " [ verbose ]\n"); + fprintf(stderr, " [ verbose ] [ direct-action ]\n"); 
fprintf(stderr, "\n"); fprintf(stderr, "Common remaining options:\n"); fprintf(stderr, " [ action ACTION_SPEC ]\n"); @@ -69,6 +69,7 @@ static int bpf_parse_opt(struct filter_util *qu, char *handle, struct tcmsg *t = NLMSG_DATA(n); const char *bpf_uds_name = NULL; const char *bpf_sec_name = NULL; + unsigned int bpf_flags = 0; char *bpf_obj = NULL; struct rtattr *tail; bool seen_run = false; @@ -124,25 +125,28 @@ opt_bpf: if (ebpf) { bpf_uds_name = getenv(BPF_ENV_UDS); bpf_obj = *argv; - NEXT_ARG(); - if (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0) { + NEXT_ARG_FWD(); + + if (argc > 0 && + (strcmp(*argv, "section") == 0 || + strcmp(*argv, "sec") == 0)) { NEXT_ARG(); bpf_sec_name = *argv; - NEXT_ARG(); + NEXT_ARG_FWD(); } - if (!bpf_uds_name && + if (argc > 0 && !bpf_uds_name && (strcmp(*argv, "export") == 0 || strcmp(*argv, "exp") == 0)) { NEXT_ARG(); bpf_uds_name = *argv; - NEXT_ARG(); + NEXT_ARG_FWD(); } - if (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0) { + if (argc > 0 && + (strcmp(*argv, "verbose") == 0 || + strcmp(*argv, "verb") == 0)) { bpf_verbose = true; - NEXT_ARG(); + NEXT_ARG_FWD(); } PREV_ARG(); @@ -182,7 +186,10 @@ opt_bpf: fprintf(stderr, "Illegal \"classid\"\n"); return -1; } - addattr_l(n, MAX_MSG, TCA_BPF_CLASSID, &handle, 4); + addattr32(n, MAX_MSG, TCA_BPF_CLASSID, handle); + } else if (matches(*argv, "direct-action") == 0 || + matches(*argv, "da") == 0) { + bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; } else if (matches(*argv, "action") == 0) { NEXT_ARG(); if (parse_action(&argc, &argv, TCA_BPF_ACT, n)) { @@ -208,10 +215,13 @@ opt_bpf: explain(); return -1; } - argc--; - argv++; + + NEXT_ARG_FWD(); } + if (bpf_obj && bpf_flags) + addattr32(n, MAX_MSG, TCA_BPF_FLAGS, bpf_flags); + tail->rta_len = (((void *)n) + n->nlmsg_len) - (void *)tail; if (bpf_uds_name) @@ -244,6 +254,13 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, else if (tb[TCA_BPF_FD]) fprintf(f, "pfd %u ", rta_getattr_u32(tb[TCA_BPF_FD])); + 
if (tb[TCA_BPF_FLAGS]) { + unsigned int flags = rta_getattr_u32(tb[TCA_BPF_FLAGS]); + + if (flags & TCA_BPF_FLAG_ACT_DIRECT) + fprintf(f, "direct-action "); + } + if (tb[TCA_BPF_OPS] && tb[TCA_BPF_OPS_LEN]) { bpf_print_ops(f, tb[TCA_BPF_OPS], rta_getattr_u16(tb[TCA_BPF_OPS_LEN])); From b8c753245bad3f13a03b105b724ff406d278c753 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 7 Oct 2015 10:23:24 -0700 Subject: [PATCH 013/151] ip neigh: Add ifindex to request when filtering dumps by device Add ifindex to dump request when filtering by device. If the kernel supports it adding the index to the request limits the amount of data the kernel pushes to userspace. The feature exists in userspace already, so no need to warn the user if kernel side support does not exist. Using the kernel side filter makes the request more efficient. Signed-off-by: David Ahern --- ip/ipneigh.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index b8973a2d..ded514da 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -419,6 +419,7 @@ static int do_show_or_flush(int argc, char **argv, int flush) fprintf(stderr, "Cannot find device \"%s\"\n", filter_dev); return -1; } + addattr32(&req.n, sizeof(req), NDA_IFINDEX, filter.index); } if (flush) { From 343dc90854978170842351683a1086e8ec9f53ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Oct 2015 15:22:05 +0200 Subject: [PATCH 014/151] m_bpf: don't require default opcode on ebpf actions After the patch, the most minimal command to load an eBPF action for late binding with auto index selection through tc is: tc actions add action bpf obj prog.o We already set TC_ACT_PIPE in tc as default opcode, so if nothing further has been specified, just use it. Also, allow "ok" next to "pass" for matching cmdline on TC_ACT_OK.
Signed-off-by: Daniel Borkmann --- tc/m_bpf.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/tc/m_bpf.c b/tc/m_bpf.c index e1bb6a49..fb4c3c7f 100644 --- a/tc/m_bpf.c +++ b/tc/m_bpf.c @@ -111,25 +111,28 @@ opt_bpf: if (ebpf) { bpf_uds_name = getenv(BPF_ENV_UDS); bpf_obj = *argv; - NEXT_ARG(); - if (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0) { + NEXT_ARG_FWD(); + + if (argc > 0 && + (strcmp(*argv, "section") == 0 || + strcmp(*argv, "sec") == 0)) { NEXT_ARG(); bpf_sec_name = *argv; - NEXT_ARG(); + NEXT_ARG_FWD(); } - if (!bpf_uds_name && + if (argc > 0 && !bpf_uds_name && (strcmp(*argv, "export") == 0 || strcmp(*argv, "exp") == 0)) { NEXT_ARG(); bpf_uds_name = *argv; - NEXT_ARG(); + NEXT_ARG_FWD(); } - if (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0) { + if (argc > 0 && + (strcmp(*argv, "verbose") == 0 || + strcmp(*argv, "verb") == 0)) { bpf_verbose = true; - NEXT_ARG(); + NEXT_ARG_FWD(); } PREV_ARG(); @@ -166,33 +169,29 @@ opt_bpf: goto opt_bpf; break; } - argc--; - argv++; + + NEXT_ARG_FWD(); } parm.action = TC_ACT_PIPE; if (argc) { if (matches(*argv, "reclassify") == 0) { parm.action = TC_ACT_RECLASSIFY; - argc--; - argv++; + NEXT_ARG_FWD(); } else if (matches(*argv, "pipe") == 0) { parm.action = TC_ACT_PIPE; - argc--; - argv++; + NEXT_ARG_FWD(); } else if (matches(*argv, "drop") == 0 || matches(*argv, "shot") == 0) { parm.action = TC_ACT_SHOT; - argc--; - argv++; + NEXT_ARG_FWD(); } else if (matches(*argv, "continue") == 0) { parm.action = TC_ACT_UNSPEC; - argc--; - argv++; - } else if (matches(*argv, "pass") == 0) { + NEXT_ARG_FWD(); + } else if (matches(*argv, "pass") == 0 || + matches(*argv, "ok") == 0) { parm.action = TC_ACT_OK; - argc--; - argv++; + NEXT_ARG_FWD(); } } @@ -203,8 +202,8 @@ opt_bpf: fprintf(stderr, "bpf: Illegal \"index\"\n"); return -1; } - argc--; - argv++; + + NEXT_ARG_FWD(); } } From 541f1b3e1d1011718cbef85adb284a72b183c676 Mon Sep 17 
00:00:00 2001 From: Phil Sutter Date: Fri, 25 Sep 2015 14:09:49 +0200 Subject: [PATCH 015/151] ip: link: consolidate macvlan and macvtap After eliminating the minor differences in both files which existed solely because features/fixes were applied to only one of them and not the other, the remaining differences were in function naming and error messages. The latter is addressed by using the 'id' field of struct link_util. Fold both files into one in order to share common code and eliminate the chance of having fixes/enhancements applied to only one of them. Signed-off-by: Phil Sutter --- ip/Makefile | 2 +- ip/iplink_macvlan.c | 40 +++++++++++------ ip/iplink_macvtap.c | 105 -------------------------------------------- 3 files changed, 28 insertions(+), 119 deletions(-) delete mode 100644 ip/iplink_macvtap.c diff --git a/ip/Makefile b/ip/Makefile index d8b38ac2..52b76efb 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -3,7 +3,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \ ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o iptuntap.o iptoken.o \ ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \ iplink_vlan.o link_veth.o link_gre.o iplink_can.o \ - iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o link_vti6.o \ + iplink_macvlan.o ipl2tp.o link_vti.o link_vti6.o \ iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \ link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \ iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \ diff --git a/ip/iplink_macvlan.c b/ip/iplink_macvlan.c index 826b6591..d759f0e1 100644 --- a/ip/iplink_macvlan.c +++ b/ip/iplink_macvlan.c @@ -1,5 +1,5 @@ /* - * iplink_vlan.c VLAN device support + * iplink_macvlan.c macvlan/macvtap device support * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -20,22 +20,29 @@ #include "utils.h" #include "ip_common.h" -static void print_explain(FILE *f) +#define 
pfx_err(lu, ...) { \ + fprintf(stderr, "%s: ", lu->id); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ +} + +static void print_explain(struct link_util *lu, FILE *f) { fprintf(f, - "Usage: ... macvlan mode { private | vepa | bridge | passthru }\n" + "Usage: ... %s mode { private | vepa | bridge | passthru }\n", + lu->id ); } -static void explain(void) +static void explain(struct link_util *lu) { - print_explain(stderr); + print_explain(lu, stderr); } -static int mode_arg(void) +static int mode_arg(const char *arg) { fprintf(stderr, "Error: argument of \"mode\" must be \"private\", " - "\"vepa\", \"bridge\" or \"passthru\" \n"); + "\"vepa\", \"bridge\" or \"passthru\", not \"%s\"\n", arg); return -1; } @@ -56,15 +63,14 @@ static int macvlan_parse_opt(struct link_util *lu, int argc, char **argv, else if (strcmp(*argv, "passthru") == 0) mode = MACVLAN_MODE_PASSTHRU; else - return mode_arg(); - + return mode_arg(*argv); addattr32(n, 1024, IFLA_MACVLAN_MODE, mode); } else if (matches(*argv, "help") == 0) { - explain(); + explain(lu); return -1; } else { - fprintf(stderr, "macvlan: unknown option \"%s\"?\n", *argv); - explain(); + pfx_err(lu, "unknown option \"%s\"?", *argv); + explain(lu); return -1; } argc--, argv++; @@ -96,7 +102,7 @@ static void macvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[] static void macvlan_print_help(struct link_util *lu, int argc, char **argv, FILE *f) { - print_explain(f); + print_explain(lu, f); } struct link_util macvlan_link_util = { @@ -106,3 +112,11 @@ struct link_util macvlan_link_util = { .print_opt = macvlan_print_opt, .print_help = macvlan_print_help, }; + +struct link_util macvtap_link_util = { + .id = "macvtap", + .maxattr = IFLA_MACVLAN_MAX, + .parse_opt = macvlan_parse_opt, + .print_opt = macvlan_print_opt, + .print_help = macvlan_print_help, +}; diff --git a/ip/iplink_macvtap.c b/ip/iplink_macvtap.c deleted file mode 100644 index 9c2cd74d..00000000 --- a/ip/iplink_macvtap.c +++ /dev/null @@ 
-1,105 +0,0 @@ -/* - * iplink_macvtap.c macvtap device support - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include "rt_names.h" -#include "utils.h" -#include "ip_common.h" - -static void print_explain(FILE *f) -{ - fprintf(stderr, - "Usage: ... macvtap mode { private | vepa | bridge | passthru }\n" - ); -} - -static void explain(void) -{ - print_explain(stderr); -} - -static int mode_arg(const char *arg) -{ - fprintf(stderr, "Error: argument of \"mode\" must be \"private\", " - "\"vepa\", \"bridge\" or \"passthru\", not \"%s\"\n", arg); - return -1; -} - -static int macvtap_parse_opt(struct link_util *lu, int argc, char **argv, - struct nlmsghdr *n) -{ - while (argc > 0) { - if (matches(*argv, "mode") == 0) { - __u32 mode = 0; - NEXT_ARG(); - - if (strcmp(*argv, "private") == 0) - mode = MACVLAN_MODE_PRIVATE; - else if (strcmp(*argv, "vepa") == 0) - mode = MACVLAN_MODE_VEPA; - else if (strcmp(*argv, "bridge") == 0) - mode = MACVLAN_MODE_BRIDGE; - else if (strcmp(*argv, "passthru") == 0) - mode = MACVLAN_MODE_PASSTHRU; - else - return mode_arg(*argv); - - addattr32(n, 1024, IFLA_MACVLAN_MODE, mode); - } else if (matches(*argv, "help") == 0) { - explain(); - return -1; - } else { - fprintf(stderr, "macvtap: unknown command \"%s\"?\n", *argv); - explain(); - return -1; - } - argc--, argv++; - } - - return 0; -} - -static void macvtap_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) -{ - __u32 mode; - - if (!tb) - return; - - if (!tb[IFLA_MACVLAN_MODE] || - RTA_PAYLOAD(tb[IFLA_MACVLAN_MODE]) < sizeof(__u32)) - return; - - mode = rta_getattr_u32(tb[IFLA_VLAN_ID]); - fprintf(f, " mode %s ", - mode == MACVLAN_MODE_PRIVATE ? "private" - : mode == MACVLAN_MODE_VEPA ? 
"vepa" - : mode == MACVLAN_MODE_BRIDGE ? "bridge" - : mode == MACVLAN_MODE_PASSTHRU ? "passthru" - : "unknown"); -} - -static void macvtap_print_help(struct link_util *lu, int argc, char **argv, - FILE *f) -{ - print_explain(f); -} - -struct link_util macvtap_link_util = { - .id = "macvtap", - .maxattr = IFLA_MACVLAN_MAX, - .parse_opt = macvtap_parse_opt, - .print_opt = macvtap_print_opt, - .print_help = macvtap_print_help, -}; From 3cf8ba596026b61f58e674fa28f3a310597e8d36 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 25 Sep 2015 14:09:50 +0200 Subject: [PATCH 016/151] ip: macvlan: support MACVLAN_FLAG_NOPROMISC flag This flag is allowed for devices in passthru mode to prevent forcing the underlying interface into promiscuous mode. Signed-off-by: Phil Sutter --- ip/iplink_macvlan.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/ip/iplink_macvlan.c b/ip/iplink_macvlan.c index d759f0e1..f195e81d 100644 --- a/ip/iplink_macvlan.c +++ b/ip/iplink_macvlan.c @@ -29,7 +29,7 @@ static void print_explain(struct link_util *lu, FILE *f) { fprintf(f, - "Usage: ... %s mode { private | vepa | bridge | passthru }\n", + "Usage: ... 
%s mode { private | vepa | bridge | passthru [nopromisc] }\n", lu->id ); } @@ -49,9 +49,11 @@ static int mode_arg(const char *arg) static int macvlan_parse_opt(struct link_util *lu, int argc, char **argv, struct nlmsghdr *n) { + __u32 mode = 0; + __u16 flags = 0; + while (argc > 0) { if (matches(*argv, "mode") == 0) { - __u32 mode = 0; NEXT_ARG(); if (strcmp(*argv, "private") == 0) @@ -64,7 +66,8 @@ static int macvlan_parse_opt(struct link_util *lu, int argc, char **argv, mode = MACVLAN_MODE_PASSTHRU; else return mode_arg(*argv); - addattr32(n, 1024, IFLA_MACVLAN_MODE, mode); + } else if (matches(*argv, "nopromisc") == 0) { + flags |= MACVLAN_FLAG_NOPROMISC; } else if (matches(*argv, "help") == 0) { explain(lu); return -1; @@ -76,12 +79,25 @@ static int macvlan_parse_opt(struct link_util *lu, int argc, char **argv, argc--, argv++; } + if (mode) + addattr32(n, 1024, IFLA_MACVLAN_MODE, mode); + + if (flags) { + if (flags & MACVLAN_FLAG_NOPROMISC && + mode != MACVLAN_MODE_PASSTHRU) { + pfx_err(lu, "nopromisc flag only valid in passthru mode"); + explain(lu); + return -1; + } + addattr16(n, 1024, IFLA_MACVLAN_FLAGS, flags); + } return 0; } static void macvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) { __u32 mode; + __u16 flags; if (!tb) return; @@ -97,6 +113,14 @@ static void macvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[] : mode == MACVLAN_MODE_BRIDGE ? "bridge" : mode == MACVLAN_MODE_PASSTHRU ? 
"passthru" : "unknown"); + + if (!tb[IFLA_MACVLAN_FLAGS] || + RTA_PAYLOAD(tb[IFLA_MACVLAN_FLAGS]) < sizeof(__u16)) + return; + + flags = rta_getattr_u16(tb[IFLA_MACVLAN_FLAGS]); + if (flags & MACVLAN_FLAG_NOPROMISC) + fprintf(f, "nopromisc "); } static void macvlan_print_help(struct link_util *lu, int argc, char **argv, From a60223bc1c10d4b172caa966acebb5a1620d0d6f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 25 Sep 2015 14:09:51 +0200 Subject: [PATCH 017/151] man: ip-link: document MACVLAN/MACVTAP interface types Signed-off-by: Phil Sutter --- man/man8/ip-link.8.in | 50 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 4928249d..ac6f4813 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -768,6 +768,56 @@ the following additional arguments are supported: .in -8 +.TP +MACVLAN and MACVTAP Type Support +For a link of type +.I MACVLAN +or +.I MACVTAP +the following additional arguments are supported: + +.BI "ip link add link " DEVICE " name " NAME +.BR type " { " macvlan " | " macvtap " } " +.BR mode " { " private " | " vepa " | " bridge " | " passthru +.BR " [ " nopromisc " ] } " + +.in +8 +.sp +.BR type " { " macvlan " | " macvtap " } " +- specifies the link type to use. +.BR macvlan " creates just a virtual interface, while " +.BR macvtap " in addition creates a character device " +.BR /dev/tapX " to be used just like a " tuntap " device." + +.B mode private +- Do not allow communication between +.B macvlan +instances on the same physical interface, even if the external switch supports +hairpin mode. + +.B mode vepa +- Virtual Ethernet Port Aggregator mode. Data from one +.B macvlan +instance to the other on the same physical interface is transmitted over the +physical interface. Either the attached switch needs to support hairpin mode, +or there must be a TCP/IP router forwarding the packets in order to allow +communication. This is the default mode. 
+ +.B mode bridge +- In bridge mode, all endpoints are directly connected to each other, +communication is not redirected through the physical interface's peer. + +.BR mode " " passthru " [ " nopromisc " ] " +- This mode gives more power to a single endpoint, usually in +.BR macvtap " mode. It is not allowed for more than one endpoint on the same " +physical interface. All traffic will be forwarded to this endpoint, allowing +virtio guests to change MAC address or set promiscuous mode in order to bridge +the interface or create vlan interfaces on top of it. By default, this mode +forces the underlying interface into promiscuous mode. Passing the +.BR nopromisc " flag prevents this, so the promisc flag may be controlled " +using standard tools. +.in -8 + .SS ip link delete - delete virtual link .TP From 0ee9052f1bc2f632b7a181aaaee28584fd82cc18 Mon Sep 17 00:00:00 2001 From: willy tarreau Date: Tue, 6 Oct 2015 12:09:33 +0200 Subject: [PATCH 018/151] fix "ss -p" segfaults I've updated Jose's patch to make it slightly simpler (eg: calloc instead of malloc+memset), and ported it to 4.2.0 which requires it as well, and attached it to this e-mail. I can confirm that with this patch 4.1.1 doesn't segfault on me anymore. The commit message should be reworked I guess though everything's in it and I didn't want to modify his description. Can it be merged as-is or should I reword the commit message and reference Jose as the fix reporter ? We should not let this bug live forever. From: "j.ps@openmailbox.org" Essentially all that is needed to get rid of this issue is the addition of: memset(u, 0, sizeof(*u)); after: if (!(u = malloc(sizeof(*u)))) break; Also patched some other situations (strcpy and sprintf uses) that potentially produce the same results. 
Signed-off-by: Jose P Santos [ wt: made Jose's patch slightly simpler, all credits to him for the diag ] Signed-off-by: Willy Tarreau --- misc/ss.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index 7c3dfa3e..eca4aa35 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -457,7 +457,9 @@ static void user_ent_hash_build(void) user_ent_hash_build_init = 1; - strcpy(name, root); + strncpy(name, root, sizeof(name)-1); + name[sizeof(name)-1] = 0; + if (strlen(name) == 0 || name[strlen(name)-1] != '/') strcat(name, "/"); @@ -481,7 +483,7 @@ static void user_ent_hash_build(void) if (getpidcon(pid, &pid_context) != 0) pid_context = strdup(no_ctx); - sprintf(name + nameoff, "%d/fd/", pid); + snprintf(name + nameoff, sizeof(name) - nameoff, "%d/fd/", pid); pos = strlen(name); if ((dir1 = opendir(name)) == NULL) { free(pid_context); @@ -502,7 +504,7 @@ static void user_ent_hash_build(void) if (sscanf(d1->d_name, "%d%c", &fd, &crap) != 1) continue; - sprintf(name+pos, "%d", fd); + snprintf(name+pos, sizeof(name) - pos, "%d", fd); link_len = readlink(name, lnk, sizeof(lnk)-1); if (link_len == -1) @@ -2738,7 +2740,7 @@ static int unix_show(struct filter *f) struct sockstat *u, **insp; int flags; - if (!(u = malloc(sizeof(*u)))) + if (!(u = calloc(1, sizeof(*u)))) break; u->name = NULL; u->peer_name = NULL; @@ -3088,11 +3090,13 @@ static int netlink_show_one(struct filter *f, strncpy(procname, "kernel", 6); } else if (pid > 0) { FILE *fp; - sprintf(procname, "%s/%d/stat", + snprintf(procname, sizeof(procname), "%s/%d/stat", getenv("PROC_ROOT") ? 
: "/proc", pid); if ((fp = fopen(procname, "r")) != NULL) { if (fscanf(fp, "%*d (%[^)])", procname) == 1) { - sprintf(procname+strlen(procname), "/%d", pid); + snprintf(procname+strlen(procname), + sizeof(procname)-strlen(procname), + "/%d", pid); done = 1; } fclose(fp); From 303cc9cbeed6cfb9f08a4073f07cb466cc8098e8 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 15 Oct 2015 13:13:38 +0200 Subject: [PATCH 019/151] libnetlink: introduce rta_nest and u8, u16, u64 helpers for nesting within rtattr This patch introduces two new api's rta_nest and rta_nest_end to nest attributes inside a rta attribute represented by 'struct rtattr' as required to construct a nexthop. Also adds rta_addattr* variants for u8, u16 and u64 as needed to support encapsulation. Signed-off-by: Roopa Prabhu Signed-off-by: Thomas Graf Acked-by: Jiri Benc --- include/libnetlink.h | 10 ++++++++++ lib/libnetlink.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/include/libnetlink.h b/include/libnetlink.h index 0503dea5..407440d8 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -86,7 +86,10 @@ int addattr_nest_end(struct nlmsghdr *n, struct rtattr *nest); struct rtattr *addattr_nest_compat(struct nlmsghdr *n, int maxlen, int type, const void *data, int len); int addattr_nest_compat_end(struct nlmsghdr *n, struct rtattr *nest); +int rta_addattr8(struct rtattr *rta, int maxlen, int type, __u8 data); +int rta_addattr16(struct rtattr *rta, int maxlen, int type, __u16 data); int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data); +int rta_addattr64(struct rtattr *rta, int maxlen, int type, __u64 data); int rta_addattr_l(struct rtattr *rta, int maxlen, int type, const void *data, int alen); @@ -98,6 +101,13 @@ int parse_rtattr_byindex(struct rtattr *tb[], int max, struct rtattr *parse_rtattr_one(int type, struct rtattr *rta, int len); int __parse_rtattr_nested_compat(struct rtattr *tb[], int max, struct rtattr *rta, int len); +struct 
rtattr *rta_nest(struct rtattr *rta, int maxlen, int type); +int rta_nest_end(struct rtattr *rta, struct rtattr *nest); + +#define RTA_TAIL(rta) \ + ((struct rtattr *) (((void *) (rta)) + \ + RTA_ALIGN((rta)->rta_len))) + #define parse_rtattr_nested(tb, max, rta) \ (parse_rtattr((tb), (max), RTA_DATA(rta), RTA_PAYLOAD(rta))) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 46cac34c..8430033b 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -721,6 +721,37 @@ int rta_addattr_l(struct rtattr *rta, int maxlen, int type, return 0; } +int rta_addattr8(struct rtattr *rta, int maxlen, int type, __u8 data) +{ + return rta_addattr_l(rta, maxlen, type, &data, sizeof(__u8)); +} + +int rta_addattr16(struct rtattr *rta, int maxlen, int type, __u16 data) +{ + return rta_addattr_l(rta, maxlen, type, &data, sizeof(__u16)); +} + +int rta_addattr64(struct rtattr *rta, int maxlen, int type, __u64 data) +{ + return rta_addattr_l(rta, maxlen, type, &data, sizeof(__u64)); +} + +struct rtattr *rta_nest(struct rtattr *rta, int maxlen, int type) +{ + struct rtattr *nest = RTA_TAIL(rta); + + rta_addattr_l(rta, maxlen, type, NULL, 0); + + return nest; +} + +int rta_nest_end(struct rtattr *rta, struct rtattr *nest) +{ + nest->rta_len = (void *)RTA_TAIL(rta) - (void *)nest; + + return rta->rta_len; +} + int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len) { return parse_rtattr_flags(tb, max, rta, len, 0); From 39ca4879a09aa0a8d0ba5695657ff18fbae734c6 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 15 Oct 2015 11:47:43 -0700 Subject: [PATCH 020/151] ip monitor neigh: Change 'delete' to 'Deleted' to be consistent with ip route It helps to grep for one string "Deleted" when monitoring all events. 
Fixes: 6ea3ebafe077 ("iproute2: inform user when a neighbor is removed") Signed-off-by: Roopa Prabhu Acked-by: Nicolas Dichtel --- ip/ipneigh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index a9e23f45..ce57edeb 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -256,7 +256,7 @@ int print_neigh(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) } if (n->nlmsg_type == RTM_DELNEIGH) - fprintf(fp, "delete "); + fprintf(fp, "Deleted "); else if (n->nlmsg_type == RTM_GETNEIGH) fprintf(fp, "miss "); if (tb[NDA_DST]) { From 6f07f3dc41545657c0364eb17850b946f41861bf Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 16 Oct 2015 12:38:33 +0200 Subject: [PATCH 021/151] ip-address: fix oneline mode for interfaces with VF Signed-off-by: Phil Sutter --- ip/ipaddress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index e864ca65..f290205b 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -345,7 +345,7 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo) } else vf_linkstate = NULL; - fprintf(fp, "\n vf %d MAC %s", vf_mac->vf, + fprintf(fp, "%s vf %d MAC %s", _SL_, vf_mac->vf, ll_addr_n2a((unsigned char *)&vf_mac->mac, ETH_ALEN, 0, b1, sizeof(b1))); if (vf_vlan->vlan) From e569c5c0fd1ecb6547613e5b1e889842b226b4e9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 16 Oct 2015 16:07:03 -0700 Subject: [PATCH 022/151] add tunnel header files from net-next uapi Files needed for new lwtunnel code. 
--- include/linux/lwtunnel.h | 43 +++++++++++++++++++++++++++++++++++ include/linux/mpls_iptunnel.h | 28 +++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 include/linux/lwtunnel.h create mode 100644 include/linux/mpls_iptunnel.h diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 00000000..1d2f4f6c --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,43 @@ +#ifndef _LWTUNNEL_H_ +#define _LWTUNNEL_H_ + +#include + +enum lwtunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, + LWTUNNEL_ENCAP_IP, + LWTUNNEL_ENCAP_ILA, + LWTUNNEL_ENCAP_IP6, + __LWTUNNEL_ENCAP_MAX, +}; + +#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) + +enum lwtunnel_ip_t { + LWTUNNEL_IP_UNSPEC, + LWTUNNEL_IP_ID, + LWTUNNEL_IP_DST, + LWTUNNEL_IP_SRC, + LWTUNNEL_IP_TTL, + LWTUNNEL_IP_TOS, + LWTUNNEL_IP_FLAGS, + __LWTUNNEL_IP_MAX, +}; + +#define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) + +enum lwtunnel_ip6_t { + LWTUNNEL_IP6_UNSPEC, + LWTUNNEL_IP6_ID, + LWTUNNEL_IP6_DST, + LWTUNNEL_IP6_SRC, + LWTUNNEL_IP6_HOPLIMIT, + LWTUNNEL_IP6_TC, + LWTUNNEL_IP6_FLAGS, + __LWTUNNEL_IP6_MAX, +}; + +#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) + +#endif /* _LWTUNNEL_H_ */ diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 00000000..4132c3c5 --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ From 1e5293056a02cb9f0dfb2c87e503e9f5acef16e2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 15 Oct 2015 13:13:39 +0200 Subject: [PATCH 023/151] lwtunnel: Add encapsulation support to ip route This patch adds support to parse and print lwtunnel encapsulation attributes attached to routes for MPLS and IP tunnels. example: Add ipv4 route with mpls encap attributes: Examples: MPLS: $ ip route add 40.1.2.0/30 encap mpls 200 via inet 40.1.1.1 dev eth3 $ ip route show 40.1.2.0/30 encap mpls 200 via 40.1.1.1 dev eth3 Add ipv4 multipath route with mpls encap attributes: $ ip route add 10.1.1.0/30 nexthop encap mpls 200 via 10.1.1.1 dev eth0 \ nexthop encap mpls 700 via 40.1.1.2 dev eth3 $ ip route show 10.1.1.0/30 nexthop encap mpls 200 via 10.1.1.1 dev eth0 weight 1 nexthop encap mpls 700 via 40.1.1.2 dev eth3 weight 1 IP: $ ip route add 10.1.1.1/24 encap ip id 200 dst 20.1.1.1 dev vxlan0 Signed-off-by: Roopa Prabhu Signed-off-by: Thomas Graf Acked-by: Jiri Benc --- include/utils.h | 3 + ip/Makefile | 2 +- ip/iproute.c | 39 +++++++- ip/iproute_lwtunnel.c | 228 ++++++++++++++++++++++++++++++++++++++++++ ip/iproute_lwtunnel.h | 8 ++ 5 files changed, 274 insertions(+), 6 deletions(-) create mode 100644 ip/iproute_lwtunnel.c create mode 100644 ip/iproute_lwtunnel.h diff --git a/include/utils.h b/include/utils.h index 668d159c..1d351490 100644 --- a/include/utils.h +++ b/include/utils.h @@ -192,6 +192,9 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n); __attribute__ ((format (printf, (pos_str), (pos_args)))) #endif +#define htonll(x) ((1==htonl(1)) ? 
(x) : ((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32)) +#define ntohll(x) ((1==ntohl(1)) ? (x) : ((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32)) + extern int cmdlineno; ssize_t getcmdline(char **line, size_t *len, FILE *in); int makeargs(char *line, char *argv[], int maxargs); diff --git a/ip/Makefile b/ip/Makefile index 52b76efb..f3d29873 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -7,7 +7,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \ iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \ link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \ iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \ - iplink_geneve.o iplink_vrf.o + iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o RTMONOBJ=rtmon.o diff --git a/ip/iproute.c b/ip/iproute.c index b0cd299e..ce90895b 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -29,6 +29,7 @@ #include "rt_names.h" #include "utils.h" #include "ip_common.h" +#include "iproute_lwtunnel.h" #ifndef RTAX_RTTVAR #define RTAX_RTTVAR RTAX_HOPS @@ -76,7 +77,8 @@ static void usage(void) fprintf(stderr, " [ table TABLE_ID ] [ proto RTPROTO ]\n"); fprintf(stderr, " [ scope SCOPE ] [ metric METRIC ]\n"); fprintf(stderr, "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n"); - fprintf(stderr, "NH := [ via [ FAMILY ] ADDRESS ] [ dev STRING ] [ weight NUMBER ] NHFLAGS\n"); + fprintf(stderr, "NH := [ encap ENCAPTYPE ENCAPHDR ] [ via [ FAMILY ] ADDRESS ]\n"); + fprintf(stderr, " [ dev STRING ] [ weight NUMBER ] NHFLAGS\n"); fprintf(stderr, "FAMILY := [ inet | inet6 | ipx | dnet | mpls | bridge | link ]\n"); fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n"); fprintf(stderr, " [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n"); @@ -95,6 +97,8 @@ static void usage(void) fprintf(stderr, "TIME := NUMBER[s|ms]\n"); fprintf(stderr, "BOOL := [1|0]\n"); fprintf(stderr, "FEATURES := ecn\n"); + fprintf(stderr, "ENCAPTYPE 
:= [ mpls | ip | ip6 ]\n"); + fprintf(stderr, "ENCAPHDR := [ MPLSLABEL ]\n"); exit(-1); } @@ -401,6 +405,10 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) abuf, sizeof(abuf)) ); } + + if (tb[RTA_ENCAP]) + lwt_print_encap(fp, tb[RTA_ENCAP_TYPE], tb[RTA_ENCAP]); + if (r->rtm_tos && filter.tosmask != -1) { SPRINT_BUF(b1); fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1))); @@ -633,6 +641,12 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) fprintf(fp, "%s\tnexthop", _SL_); if (nh->rtnh_len > sizeof(*nh)) { parse_rtattr(tb, RTA_MAX, RTNH_DATA(nh), nh->rtnh_len - sizeof(*nh)); + + if (tb[RTA_ENCAP]) + lwt_print_encap(fp, + tb[RTA_ENCAP_TYPE], + tb[RTA_ENCAP]); + if (tb[RTA_GATEWAY]) { fprintf(fp, " via %s ", format_host(r->rtm_family, @@ -704,9 +718,8 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) return 0; } - -static int parse_one_nh(struct rtmsg *r, struct rtattr *rta, - struct rtnexthop *rtnh, +static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r, + struct rtattr *rta, struct rtnexthop *rtnh, int *argcp, char ***argvp) { int argc = *argcp; @@ -753,6 +766,11 @@ static int parse_one_nh(struct rtmsg *r, struct rtattr *rta, invarg("\"realm\" value is invalid\n", *argv); rta_addattr32(rta, 4096, RTA_FLOW, realm); rtnh->rtnh_len += sizeof(struct rtattr) + 4; + } else if (strcmp(*argv, "encap") == 0) { + int len = rta->rta_len; + + lwt_parse_encap(rta, 4096, &argc, &argv); + rtnh->rtnh_len += rta->rta_len - len; } else break; } @@ -784,7 +802,7 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r, memset(rtnh, 0, sizeof(*rtnh)); rtnh->rtnh_len = sizeof(*rtnh); rta->rta_len += rtnh->rtnh_len; - parse_one_nh(r, rta, rtnh, &argc, &argv); + parse_one_nh(n, r, rta, rtnh, &argc, &argv); rtnh = RTNH_NEXT(rtnh); } @@ -1092,6 +1110,17 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv) else if (get_u8(&pref, *argv, 0)) 
invarg("\"pref\" value is invalid\n", *argv); addattr8(&req.n, sizeof(req), RTA_PREF, pref); + } else if (strcmp(*argv, "encap") == 0) { + char buf[1024]; + struct rtattr *rta = (void*)buf; + + rta->rta_type = RTA_ENCAP; + rta->rta_len = RTA_LENGTH(0); + + lwt_parse_encap(rta, sizeof(buf), &argc, &argv); + + if (rta->rta_len > RTA_LENGTH(0)) + addraw_l(&req.n, 1024, RTA_DATA(rta), RTA_PAYLOAD(rta)); } else { int type; inet_prefix dst; diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c new file mode 100644 index 00000000..63322a18 --- /dev/null +++ b/ip/iproute_lwtunnel.c @@ -0,0 +1,228 @@ +/* + * iproute_lwtunnel.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Roopa Prabhu, + * Thomas Graf + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rt_names.h" +#include "utils.h" +#include "iproute_lwtunnel.h" + +static int read_encap_type(const char *name) +{ + if (strcmp(name, "mpls") == 0) + return LWTUNNEL_ENCAP_MPLS; + else if (strcmp(name, "ip") == 0) + return LWTUNNEL_ENCAP_IP; + else if (strcmp(name, "ip6") == 0) + return LWTUNNEL_ENCAP_IP6; + else + return LWTUNNEL_ENCAP_NONE; +} + +static const char *format_encap_type(int type) +{ + switch (type) { + case LWTUNNEL_ENCAP_MPLS: + return "mpls"; + case LWTUNNEL_ENCAP_IP: + return "ip"; + case LWTUNNEL_ENCAP_IP6: + return "ip6"; + default: + return "unknown"; + } +} + +static void print_encap_mpls(FILE *fp, struct rtattr *encap) +{ + struct rtattr *tb[MPLS_IPTUNNEL_MAX+1]; + char abuf[256]; + + parse_rtattr_nested(tb, MPLS_IPTUNNEL_MAX, encap); + + if (tb[MPLS_IPTUNNEL_DST]) + fprintf(fp, " %s ", format_host(AF_MPLS, + RTA_PAYLOAD(tb[MPLS_IPTUNNEL_DST]), + RTA_DATA(tb[MPLS_IPTUNNEL_DST]), + abuf, sizeof(abuf))); +} + +static void 
print_encap_ip(FILE *fp, struct rtattr *encap) +{ + struct rtattr *tb[LWTUNNEL_IP_MAX+1]; + char abuf[256]; + + parse_rtattr_nested(tb, LWTUNNEL_IP_MAX, encap); + + if (tb[LWTUNNEL_IP_ID]) + fprintf(fp, "id %llu ", ntohll(rta_getattr_u64(tb[LWTUNNEL_IP_ID]))); + + if (tb[LWTUNNEL_IP_SRC]) + fprintf(fp, "src %s ", + rt_addr_n2a(AF_INET, + RTA_PAYLOAD(tb[LWTUNNEL_IP_SRC]), + RTA_DATA(tb[LWTUNNEL_IP_SRC]), + abuf, sizeof(abuf))); + + if (tb[LWTUNNEL_IP_DST]) + fprintf(fp, "dst %s ", + rt_addr_n2a(AF_INET, + RTA_PAYLOAD(tb[LWTUNNEL_IP_DST]), + RTA_DATA(tb[LWTUNNEL_IP_DST]), + abuf, sizeof(abuf))); + + if (tb[LWTUNNEL_IP_TTL]) + fprintf(fp, "ttl %d ", rta_getattr_u8(tb[LWTUNNEL_IP_TTL])); + + if (tb[LWTUNNEL_IP_TOS]) + fprintf(fp, "tos %d ", rta_getattr_u8(tb[LWTUNNEL_IP_TOS])); +} + +void lwt_print_encap(FILE *fp, struct rtattr *encap_type, + struct rtattr *encap) +{ + int et; + + if (!encap_type) + return; + + et = rta_getattr_u16(encap_type); + + fprintf(fp, " encap %s", format_encap_type(et)); + + switch (et) { + case LWTUNNEL_ENCAP_MPLS: + print_encap_mpls(fp, encap); + break; + case LWTUNNEL_ENCAP_IP: + print_encap_ip(fp, encap); + break; + } +} + +static int parse_encap_mpls(struct rtattr *rta, size_t len, int *argcp, char ***argvp) +{ + inet_prefix addr; + int argc = *argcp; + char **argv = *argvp; + + if (get_addr(&addr, *argv, AF_MPLS)) { + fprintf(stderr, "Error: an inet address is expected rather than \"%s\".\n", *argv); + exit(1); + } + + rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST, &addr.data, + addr.bytelen); + + *argcp = argc; + *argvp = argv; + + return 0; +} + +static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***argvp) +{ + int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0; + char **argv = *argvp; + int argc = *argcp; + + while (argc > 0) { + if (strcmp(*argv, "id") == 0) { + __u64 id; + NEXT_ARG(); + if (id_ok++) + duparg2("id", *argv); + if (get_u64(&id, *argv, 0)) + invarg("\"id\" value is invalid\n", *argv); + 
rta_addattr64(rta, len, LWTUNNEL_IP_ID, htonll(id)); + } else if (strcmp(*argv, "dst") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (dst_ok++) + duparg2("dst", *argv); + get_addr(&addr, *argv, AF_INET); + rta_addattr_l(rta, len, LWTUNNEL_IP_DST, &addr.data, addr.bytelen); + } else if (strcmp(*argv, "tos") == 0) { + __u32 tos; + NEXT_ARG(); + if (tos_ok++) + duparg2("tos", *argv); + if (rtnl_dsfield_a2n(&tos, *argv)) + invarg("\"tos\" value is invalid\n", *argv); + rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos); + } else if (strcmp(*argv, "ttl") == 0) { + __u8 ttl; + NEXT_ARG(); + if (ttl_ok++) + duparg2("ttl", *argv); + if (get_u8(&ttl, *argv, 0)) + invarg("\"ttl\" value is invalid\n", *argv); + rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl); + } else { + break; + } + argc--; argv++; /* step past the value just consumed so the next keyword is examined */ + } + + *argcp = argc + 1; /* hand back the last consumed token, */ + *argvp = argv - 1; /* the same position parse_encap_mpls() leaves argv at */ + + return 0; +} + +int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp) +{ + struct rtattr *nest; + int argc = *argcp; + char **argv = *argvp; + __u16 type; + + NEXT_ARG(); + type = read_encap_type(*argv); + if (!type) + invarg("\"encap type\" value is invalid\n", *argv); + + NEXT_ARG(); + if (argc <= 1) { + fprintf(stderr, "Error: unexpected end of line after \"encap\"\n"); + exit(-1); + } + + nest = rta_nest(rta, 1024, RTA_ENCAP); + switch (type) { + case LWTUNNEL_ENCAP_MPLS: + parse_encap_mpls(rta, len, &argc, &argv); + break; + case LWTUNNEL_ENCAP_IP: + parse_encap_ip(rta, len, &argc, &argv); + break; + default: + fprintf(stderr, "Error: unsupported encap type\n"); + break; + } + rta_nest_end(rta, nest); + + rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type); + + *argcp = argc; + *argvp = argv; + + return 0; +} diff --git a/ip/iproute_lwtunnel.h b/ip/iproute_lwtunnel.h new file mode 100644 index 00000000..b82b58ad --- /dev/null +++ b/ip/iproute_lwtunnel.h @@ -0,0 +1,8 @@ +#ifndef __LWTUNNEL_H__ +#define __LWTUNNEL_H__ 1 + +int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp); +void lwt_print_encap(FILE *fp, 
struct rtattr *encap_type, + struct rtattr *encap); + +#endif From 70e4663472017b627affccbea1570d7ca7736f1c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 15 Oct 2015 13:13:40 +0200 Subject: [PATCH 024/151] ip-route man: add usage and description for lwtunnel encap attributes This patch updates ip-route man page with lwtunnel encap usage and description, covering MPLS and IP encapsulation. Signed-off-by: Roopa Prabhu Signed-off-by: Thomas Graf Acked-by: Jiri Benc --- man/man8/ip-route.8.in | 70 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 72d8d770..9934a1e8 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -80,6 +80,8 @@ replace " } " .ti -8 .IR NH " := [ " +.B encap +.IR ENCAP " ] [ " .B via [ .IR FAMILY " ] " ADDRESS " ] [ " @@ -164,6 +166,26 @@ throw " | " unreachable " | " prohibit " | " blackhole " | " nat " ]" .IR PREF " := [ " .BR low " | " medium " | " high " ]" +.ti -8 +.IR ENCAP " := [ " +.IR MPLS " | " IP " ]" + +.ti -8 +.IR ENCAP_MPLS " := " +.BR mpls " [ " +.IR LABEL " ]" + +.ti -8 +.IR ENCAP_IP " := " +.B ip +.B id +.IR TUNNEL_ID +.B dst +.IR REMOTE_IP " [ " +.B tos +.IR TOS " ] [" +.B ttl +.IR TTL " ]" .SH DESCRIPTION .B ip route @@ -589,6 +611,48 @@ Discovery messages. Namely: - the route has a highest priority .sp +.TP +.BI encap " ENCAPTYPE ENCAPHDR" +attach tunnel encapsulation attributes to this route. +.sp +.I ENCAPTYPE +is a string specifying the supported encapsulation type. Namely: + +.in +8 +.BI mpls +- encapsulation type MPLS +.sp +.BI ip +- IP encapsulation (Geneve, GRE, VXLAN, ...) +.sp + +.in -8 +.I ENCAPHDR +is a set of encapsulation attributes specific to the +.I ENCAPTYPE. 
+ +.in +8 +.B mpls +.in +2 +.I MPLSLABEL +- mpls label stack with labels separated by +.I "/" +.in -2 +.sp + +.B ip +.in +2 +.B id +.I TUNNEL_ID +.B dst +.IR REMOTE_IP " [ " +.B tos +.IR TOS " ] [" +.B ttl +.IR TTL " ]" +.in -2 +.sp + .in -8 .RE @@ -847,7 +911,11 @@ ip route add default via 192.168.1.1 dev eth0 Adds a default route (for all addresses) via the local gateway 192.168.1.1 that can be reached on device eth0. .RE - +.PP +ip route add 10.1.1.0/30 encap mpls 200/300 via 10.1.1.1 dev eth0 +.RS 4 +Adds an ipv4 route with mpls encapsulation attributes attached to it. +.RE .SH SEE ALSO .br .BR ip (8) From f73105ab42795422f131fb711712e3c860df982f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 15 Oct 2015 21:01:16 +0200 Subject: [PATCH 025/151] ss: return -1 if an unrecognized option was given When getopt_long encounters an option which has not been registered, it returns '?'. React upon that and call usage() instead of help() so ss returns with a non-zero exit status. Signed-off-by: Phil Sutter --- misc/ss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misc/ss.c b/misc/ss.c index eca4aa35..a9ae85ec 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -3770,8 +3770,8 @@ int main(int argc, char *argv[]) exit(1); break; case 'h': - case '?': help(); + case '?': default: usage(); } From ccaf6eb5cc1020c7fe8997b8cadf9a12cf92302a Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 15 Oct 2015 22:32:17 +0200 Subject: [PATCH 026/151] ip-rule: neither prohibit nor reject or unreachable flags exist This has been inconsistent since the beginning of Git and seems to be merely a documentation leftover, therefore just remove it from help output and man page. 
Signed-off-by: Phil Sutter --- ip/iprule.c | 1 - man/man8/ip-rule.8 | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ip/iprule.c b/ip/iprule.c index 714278a2..2fa9ade9 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -36,7 +36,6 @@ static void usage(void) fprintf(stderr, "SELECTOR := [ not ] [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK[/MASK] ]\n"); fprintf(stderr, " [ iif STRING ] [ oif STRING ] [ pref NUMBER ]\n"); fprintf(stderr, "ACTION := [ table TABLE_ID ]\n"); - fprintf(stderr, " [ prohibit | reject | unreachable ]\n"); fprintf(stderr, " [ realms [SRCREALM/]DSTREALM ]\n"); fprintf(stderr, " [ goto NUMBER ]\n"); fprintf(stderr, " SUPPRESSOR\n"); diff --git a/man/man8/ip-rule.8 b/man/man8/ip-rule.8 index dd925be6..6245d8cf 100644 --- a/man/man8/ip-rule.8 +++ b/man/man8/ip-rule.8 @@ -41,7 +41,7 @@ ip-rule \- routing policy database management .IR TABLE_ID " ] [ " .B nat .IR ADDRESS " ] [ " -.BR prohibit " | " reject " | " unreachable " ] [ " realms +.B realms .RI "[" SRCREALM "/]" DSTREALM " ]" .I SUPPRESSOR From 4d45bf3bafa6960720193dbf57042a3b43b73868 Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Thu, 15 Oct 2015 14:53:17 -0700 Subject: [PATCH 027/151] bridge: add calls to fflush in fdb and mdb print functions This patch adds fflush in fdb and mdb print functions Signed-off-by: Wilson Kok Signed-off-by: Roopa Prabhu --- bridge/fdb.c | 2 ++ bridge/mdb.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/bridge/fdb.c b/bridge/fdb.c index bd7e4f92..5ea50abb 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -163,6 +163,8 @@ int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) fprintf(fp, "offload "); fprintf(fp, "%s\n", state_n2a(r->ndm_state)); + fflush(fp); + return 0; } diff --git a/bridge/mdb.c b/bridge/mdb.c index b14bd019..24c49035 100644 --- a/bridge/mdb.c +++ b/bridge/mdb.c @@ -135,6 +135,8 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) } } + fflush(fp); + return 0; } 
From 8b21cef12943cab841ba9adabb8ce2360b04c65e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 15 Oct 2015 15:23:50 -0700 Subject: [PATCH 028/151] ip route get: change exit to return to support batch commands replace exit with return -2 on rtnl_talk failure Signed-off-by: Roopa Prabhu --- ip/iproute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index da25548c..b137f555 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1643,7 +1643,7 @@ static int iproute_get(int argc, char **argv) req.r.rtm_family = AF_INET; if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) - exit(2); + return -2; if (connected && !from_ok) { struct rtmsg *r = NLMSG_DATA(&req.n); From 2f4e171f7df22107b38fddcffa56c1ecb5e73359 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 20 Oct 2015 13:41:48 +0300 Subject: [PATCH 029/151] Add ip rule save/restore This patch adds save and restore commands to "ip rule" similar the same is made in commit f4ff11e3e298 for "ip route". The feature is useful in checkpoint/restore for container migration, also it may be helpful in some normal situations. Signed-off-by: Kirill Tkhai --- doc/ip-cref.tex | 36 ++++++++++++++++ ip/iprule.c | 103 ++++++++++++++++++++++++++++++++++++++++++--- man/man8/ip-rule.8 | 26 +++++++++++- 3 files changed, 158 insertions(+), 7 deletions(-) diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex index ea147950..67094c95 100644 --- a/doc/ip-cref.tex +++ b/doc/ip-cref.tex @@ -2246,6 +2246,42 @@ Besides that, the host 193.233.7.83 is translated into another prefix to look like 192.203.80.144 when talking to the outer world. +\subsection{{\tt ip rule save} -- save rules tables} +\label{IP-RULE-SAVE} + +\paragraph{Description:} this command saves the contents of the rules +tables or the rule(s) selected by some criteria to standard output. + +\paragraph{Arguments:} \verb|ip rule save| has the same arguments as +\verb|ip rule show|. 
+ +\paragraph{Example:} This saves all the rules to the {\tt saved\_rules} +file: +\begin{verbatim} +dan@caffeine:~ # ip rule save > saved_rules +\end{verbatim} + +\paragraph{Output format:} The format of the data stream provided by +\verb|ip rule save| is that of \verb|rtnetlink|. See +\verb|rtnetlink(7)| for more information. + +\subsection{{\tt ip rule restore} -- restore rules tables} +\label{IP-RULE-RESTORE} + +\paragraph{Description:} this command restores the contents of the rules +tables according to a data stream as provided by \verb|ip rule save| via +standard input. Note that any rules already in the table are left unchanged, +and duplicates are not ignored. + +\paragraph{Arguments:} This command takes no arguments. + +\paragraph{Example:} This restores all rules that were saved to the +{\tt saved\_rules} file: + +\begin{verbatim} +dan@caffeine:~ # ip rule restore < saved_rules +\end{verbatim} + \section{{\tt ip maddress} --- multicast addresses management} diff --git a/ip/iprule.c b/ip/iprule.c index 2fa9ade9..cec29246 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "rt_names.h" #include "utils.h" @@ -32,7 +33,8 @@ static void usage(void) __attribute__((noreturn)); static void usage(void) { - fprintf(stderr, "Usage: ip rule [ list | add | del | flush ] SELECTOR ACTION\n"); + fprintf(stderr, "Usage: ip rule [ list | add | del | flush | save ] SELECTOR ACTION\n"); + fprintf(stderr, " ip rule restore\n"); fprintf(stderr, "SELECTOR := [ not ] [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK[/MASK] ]\n"); fprintf(stderr, " [ iif STRING ] [ oif STRING ] [ pref NUMBER ]\n"); fprintf(stderr, "ACTION := [ table TABLE_ID ]\n"); @@ -205,24 +207,65 @@ int print_rule(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) return 0; } -static int iprule_list(int argc, char **argv) +static __u32 rule_dump_magic = 0x71706986; + +static int save_rule_prep(void) { + int ret; + + if 
(isatty(STDOUT_FILENO)) { + fprintf(stderr, "Not sending a binary stream to stdout\n"); + return -1; + } + + ret = write(STDOUT_FILENO, &rule_dump_magic, sizeof(rule_dump_magic)); + if (ret != sizeof(rule_dump_magic)) { + fprintf(stderr, "Can't write magic to dump file\n"); + return -1; + } + + return 0; +} + +static int save_rule(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + int ret; + + ret = write(STDOUT_FILENO, n, n->nlmsg_len); + if ((ret > 0) && (ret != n->nlmsg_len)) { + fprintf(stderr, "Short write while saving nlmsg\n"); + ret = -EIO; + } + + return ret == n->nlmsg_len ? 0 : ret; +} + +static int iprule_list_or_save(int argc, char **argv, int save) +{ + rtnl_filter_t filter = print_rule; int af = preferred_family; if (af == AF_UNSPEC) af = AF_INET; if (argc > 0) { - fprintf(stderr, "\"ip rule show\" does not take any arguments.\n"); + fprintf(stderr, "\"ip rule %s\" does not take any arguments.\n", + save ? "save" : "show"); return -1; } + if (save) { + if (save_rule_prep()) + return -1; + filter = save_rule; + } + if (rtnl_wilddump_request(&rth, af, RTM_GETRULE) < 0) { perror("Cannot send dump request"); return 1; } - if (rtnl_dump_filter(&rth, print_rule, stdout) < 0) { + if (rtnl_dump_filter(&rth, filter, stdout) < 0) { fprintf(stderr, "Dump terminated\n"); return 1; } @@ -230,6 +273,50 @@ static int iprule_list(int argc, char **argv) return 0; } +static int rule_dump_check_magic(void) +{ + int ret; + __u32 magic = 0; + + if (isatty(STDIN_FILENO)) { + fprintf(stderr, "Can't restore rule dump from a terminal\n"); + return -1; + } + + ret = fread(&magic, sizeof(magic), 1, stdin); + if (magic != rule_dump_magic) { + fprintf(stderr, "Magic mismatch (%d elems, %x magic)\n", ret, magic); + return -1; + } + + return 0; +} + +static int restore_handler(const struct sockaddr_nl *nl, + struct rtnl_ctrl_data *ctrl, + struct nlmsghdr *n, void *arg) +{ + int ret; + + n->nlmsg_flags |= NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; + + 
ll_init_map(&rth); + + ret = rtnl_talk(&rth, n, n, sizeof(*n)); + if ((ret < 0) && (errno == EEXIST)) + ret = 0; + + return ret; +} + + +static int iprule_restore(void) +{ + if (rule_dump_check_magic()) + exit(-1); + + exit(rtnl_from_file(stdin, &restore_handler, NULL)); +} static int iprule_modify(int cmd, int argc, char **argv) { @@ -443,11 +530,15 @@ static int iprule_flush(int argc, char **argv) int do_iprule(int argc, char **argv) { if (argc < 1) { - return iprule_list(0, NULL); + return iprule_list_or_save(0, NULL, 0); } else if (matches(argv[0], "list") == 0 || matches(argv[0], "lst") == 0 || matches(argv[0], "show") == 0) { - return iprule_list(argc-1, argv+1); + return iprule_list_or_save(argc-1, argv+1, 0); + } else if (matches(argv[0], "save") == 0) { + return iprule_list_or_save(argc-1, argv+1, 1); + } else if (matches(argv[0], "restore") == 0) { + return iprule_restore(); } else if (matches(argv[0], "add") == 0) { return iprule_modify(RTM_NEWRULE, argc-1, argv+1); } else if (matches(argv[0], "delete") == 0) { diff --git a/man/man8/ip-rule.8 b/man/man8/ip-rule.8 index 6245d8cf..b1d03e79 100644 --- a/man/man8/ip-rule.8 +++ b/man/man8/ip-rule.8 @@ -15,9 +15,12 @@ ip-rule \- routing policy database management .ti -8 .B ip rule -.RB " [ " list " | " add " | " del " | " flush " ]" +.RB " [ " list " | " add " | " del " | " flush " | " save " ]" .I SELECTOR ACTION +.ti -8 +.B ip rule " restore " + .ti -8 .IR SELECTOR " := [ " .B from @@ -265,6 +268,27 @@ This command has no arguments. This command has no arguments. The options list or lst are synonyms with show. +.TP +.B ip rule save +save rules table information to stdout +.RS +This command behaves like +.BR "ip rule show" +except that the output is raw data suitable for passing to +.BR "ip rule restore" . +.RE + +.TP +.B ip rule restore +restore rules table information from stdin +.RS +This command expects to read a data stream as returned from +.BR "ip rule save" . 
+It will attempt to restore the rules table information exactly as +it was at the time of the save. Any rules already in the table are +left unchanged, and duplicates are not ignored. +.RE + .SH SEE ALSO .br .BR ip (8) From 89bb4c6aca8a3b8fe009ece35f4b7380a28ef44d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 22 Oct 2015 23:36:49 -0700 Subject: [PATCH 030/151] update kernel headers Track upstream --- include/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3fe10b05..32449478 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -160,7 +160,7 @@ struct rtattr { /* Macros to handle rtattributes */ -#define RTA_ALIGNTO 4 +#define RTA_ALIGNTO 4U #define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) #define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ (rta)->rta_len >= sizeof(struct rtattr) && \ From d583e88ebc859d6ef5bddffb098fa95158d55c75 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Oct 2015 12:22:39 +0200 Subject: [PATCH 031/151] ip, realms: also allow to pass in raw realms value If get_rt_realms() fails, try to get a possible raw u32 realms value for the u32 RTA_FLOW/FRA_FLOW attribute, as it might be useful to directly configure the hex value itself. And only if that fails, then bail out. The source realm is provided in the upper u16 (mask: 0xffff0000) and the destination realm through the lower u16 part (mask: 0x0000ffff). This can be useful for tc's bpf realm matcher, but also a full hex/mask param can be provided already for matching through iptables' --realm cmdline option, for example. 
Signed-off-by: Daniel Borkmann --- include/rtm_map.h | 3 +-- ip/iproute.c | 6 +++--- ip/iprule.c | 2 +- ip/rtm_map.c | 10 +++++++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/rtm_map.h b/include/rtm_map.h index 70bda7d0..d6e5885c 100644 --- a/include/rtm_map.h +++ b/include/rtm_map.h @@ -4,7 +4,6 @@ char *rtnl_rtntype_n2a(int id, char *buf, int len); int rtnl_rtntype_a2n(int *id, char *arg); -int get_rt_realms(__u32 *realms, char *arg); - +int get_rt_realms_or_raw(__u32 *realms, char *arg); #endif /* __RTM_MAP_H__ */ diff --git a/ip/iproute.c b/ip/iproute.c index b137f555..ae86cc0d 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -749,7 +749,7 @@ static int parse_one_nh(struct rtmsg *r, struct rtattr *rta, } else if (matches(*argv, "realms") == 0) { __u32 realm; NEXT_ARG(); - if (get_rt_realms(&realm, *argv)) + if (get_rt_realms_or_raw(&realm, *argv)) invarg("\"realm\" value is invalid\n", *argv); rta_addattr32(rta, 4096, RTA_FLOW, realm); rtnh->rtnh_len += sizeof(struct rtattr) + 4; @@ -1050,7 +1050,7 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv) } else if (matches(*argv, "realms") == 0) { __u32 realm; NEXT_ARG(); - if (get_rt_realms(&realm, *argv)) + if (get_rt_realms_or_raw(&realm, *argv)) invarg("\"realm\" value is invalid\n", *argv); addattr32(&req.n, sizeof(req), RTA_FLOW, realm); } else if (strcmp(*argv, "onlink") == 0) { @@ -1383,7 +1383,7 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) } else if (matches(*argv, "realms") == 0) { __u32 realm; NEXT_ARG(); - if (get_rt_realms(&realm, *argv)) + if (get_rt_realms_or_raw(&realm, *argv)) invarg("invalid realms\n", *argv); filter.realm = realm; filter.realmmask = ~0U; diff --git a/ip/iprule.c b/ip/iprule.c index cec29246..9923b8eb 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -391,7 +391,7 @@ static int iprule_modify(int cmd, int argc, char **argv) } else if (matches(*argv, "realms") == 0) { __u32 realm; NEXT_ARG(); - if 
(get_rt_realms(&realm, *argv)) + if (get_rt_realms_or_raw(&realm, *argv)) invarg("invalid realms\n", *argv); addattr32(&req.n, sizeof(req), FRA_FLOW, realm); } else if (matches(*argv, "table") == 0 || diff --git a/ip/rtm_map.c b/ip/rtm_map.c index 21e818b4..1d7d2c7e 100644 --- a/ip/rtm_map.c +++ b/ip/rtm_map.c @@ -93,7 +93,7 @@ int rtnl_rtntype_a2n(int *id, char *arg) return 0; } -int get_rt_realms(__u32 *realms, char *arg) +static int get_rt_realms(__u32 *realms, char *arg) { __u32 realm = 0; char *p = strchr(arg, '/'); @@ -114,3 +114,11 @@ int get_rt_realms(__u32 *realms, char *arg) *realms |= realm; return 0; } + +int get_rt_realms_or_raw(__u32 *realms, char *arg) +{ + if (!get_rt_realms(realms, arg)) + return 0; + + return get_unsigned(realms, arg, 0); +} From c518d3a7f73a441b6cfa283244bd829509f4b0f4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 22 Oct 2015 23:43:35 -0700 Subject: [PATCH 032/151] update bpf kernel header --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 689be2e8..37ba6017 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,17 @@ enum bpf_func_id { * Return: realm if != 0 */ BPF_FUNC_get_route_realm, + + /** + * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample + * @ctx: struct pt_regs* + * @map: pointer to perf_event_array map + * @index: index of event in the map + * @data: data on stack to be output as raw data + * @size: size of data + * Return: 0 on success + */ + BPF_FUNC_perf_event_output, __BPF_FUNC_MAX_ID, }; From 0a83e1eaf7b3c2ed9e6d809bd7aac7946f9d8d87 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:21:17 +0200 Subject: [PATCH 033/151] tc: improve filter help texts a bit This fixes a few syntax errors and changes route filter help text to use classid instead of flowid to be consistent with other filters' help texts. 
Signed-off-by: Phil Sutter --- tc/f_flower.c | 4 ++-- tc/f_route.c | 2 +- tc/f_rsvp.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tc/f_flower.c b/tc/f_flower.c index 9a5ea062..a9b2c4df 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -28,7 +28,7 @@ static void explain(void) fprintf(stderr, " [ action ACTION-SPEC ] [ classid CLASSID ]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where: MATCH-LIST := [ MATCH-LIST ] MATCH\n"); - fprintf(stderr, " MATCH := [ indev DEV-NAME | \n"); + fprintf(stderr, " MATCH := { indev DEV-NAME | \n"); fprintf(stderr, " dst_mac MAC-ADDR | \n"); fprintf(stderr, " src_mac MAC-ADDR | \n"); fprintf(stderr, " eth_type [ipv4 | ipv6 | ETH-TYPE ] | \n"); @@ -36,7 +36,7 @@ static void explain(void) fprintf(stderr, " dst_ip [ IPV4-ADDR | IPV6-ADDR ] | \n"); fprintf(stderr, " src_ip [ IPV4-ADDR | IPV6-ADDR ] | \n"); fprintf(stderr, " dst_port PORT-NUMBER | \n"); - fprintf(stderr, " src_port PORT-NUMBER | \n"); + fprintf(stderr, " src_port PORT-NUMBER }\n"); fprintf(stderr, " FILTERID := X:Y:Z\n"); fprintf(stderr, " ACTION-SPEC := ... look at individual actions\n"); fprintf(stderr, "\n"); diff --git a/tc/f_route.c b/tc/f_route.c index 23c4ecc7..4e9032c5 100644 --- a/tc/f_route.c +++ b/tc/f_route.c @@ -28,7 +28,7 @@ static void explain(void) { fprintf(stderr, "Usage: ... route [ from REALM | fromif TAG ] [ to REALM ]\n"); - fprintf(stderr, " [ flowid CLASSID ] [ action ACTION_SPEC ]]\n"); + fprintf(stderr, " [ classid CLASSID ] [ action ACTION_SPEC ]\n"); fprintf(stderr, " ACTION_SPEC := ... look at individual actions\n"); fprintf(stderr, " CLASSID := X:Y\n"); fprintf(stderr, "\nNOTE: CLASSID is parsed as hexadecimal input.\n"); diff --git a/tc/f_rsvp.c b/tc/f_rsvp.c index cb7b8fba..1fe9b15f 100644 --- a/tc/f_rsvp.c +++ b/tc/f_rsvp.c @@ -27,7 +27,7 @@ static void explain(void) { fprintf(stderr, "Usage: ... 
rsvp ipproto PROTOCOL session DST[/PORT | GPI ]\n"); - fprintf(stderr, " [ sender SRC[/PORT | GPI ]\n"); + fprintf(stderr, " [ sender SRC[/PORT | GPI ] ]\n"); fprintf(stderr, " [ classid CLASSID ] [ action ACTION_SPEC ]\n"); fprintf(stderr, " [ tunnelid ID ] [ tunnel ID skip NUMBER ]\n"); fprintf(stderr, "Where: GPI := { flowlabel NUMBER | spi/ah SPI | spi/esp SPI |\n"); From 40eb737ebb02c61f1270b963e256a4e9f3a4a52d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:21:23 +0200 Subject: [PATCH 034/151] tc: u32 filter coding style cleanup Add missing spaces around operators to increase readability. Aside from that, make "preference" match a real synonym for "tos" and "dsfield" as it's effect was identical to them. Signed-off-by: Phil Sutter --- tc/f_u32.c | 56 ++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/tc/f_u32.c b/tc/f_u32.c index cb63869d..0b976789 100644 --- a/tc/f_u32.c +++ b/tc/f_u32.c @@ -61,14 +61,14 @@ static int get_u32_handle(__u32 *handle, const char *str) if (htid>=0x1000) return -1; if (*tmp) { - str = tmp+1; + str = tmp + 1; hash = strtoul(str, &tmp, 16); if (tmp == str && *str != ':' && *str != 0) return -1; if (hash>=0x100) return -1; if (*tmp) { - str = tmp+1; + str = tmp + 1; nodeid = strtoul(str, &tmp, 16); if (tmp == str && *str != 0) return -1; @@ -124,9 +124,9 @@ static int pack_key(struct tc_u32_sel *sel, __u32 key, __u32 mask, for (i=0; ikeys[i].off == off && sel->keys[i].offmask == offmask) { - __u32 intersect = mask&sel->keys[i].mask; + __u32 intersect = mask & sel->keys[i].mask; - if ((key^sel->keys[i].val) & intersect) + if ((key ^ sel->keys[i].val) & intersect) return -1; sel->keys[i].val |= key; sel->keys[i].mask |= mask; @@ -346,7 +346,7 @@ static int parse_ip_addr(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, mask = 0; if (addr.bitlen) - mask = htonl(0xFFFFFFFF<<(32-addr.bitlen)); + mask = htonl(0xFFFFFFFF << (32 - addr.bitlen)); if 
(pack_key(sel, addr.data[0], mask, off, offmask) < 0) return -1; res = 0; @@ -381,17 +381,17 @@ static int parse_ip6_addr(int *argc_p, char ***argv_p, } plen = addr.bitlen; - for (i=0; i>16; - hash ^= hash>>8; - htid = ((hash%divisor)<<12)|(htid&0xFFF00000); + hash = sel2.sel.keys[0].val & sel2.sel.keys[0].mask; + hash ^= hash >> 16; + hash ^= hash >> 8; + htid = ((hash % divisor) << 12) | (htid & 0xFFF00000); sample_ok = 1; continue; } else if (strcmp(*argv, "indev") == 0) { @@ -1165,7 +1163,7 @@ static int u32_parse_opt(struct filter_util *qu, char *handle, addattr_l(n, MAX_MSG, TCA_U32_HASH, &htid, 4); if (sel_ok) addattr_l(n, MAX_MSG, TCA_U32_SEL, &sel, - sizeof(sel.sel)+sel.sel.nkeys*sizeof(struct tc_u32_key)); + sizeof(sel.sel) + sel.sel.nkeys * sizeof(struct tc_u32_key)); tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; return 0; } @@ -1173,7 +1171,7 @@ static int u32_parse_opt(struct filter_util *qu, char *handle, static int u32_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) { - struct rtattr *tb[TCA_U32_MAX+1]; + struct rtattr *tb[TCA_U32_MAX + 1]; struct tc_u32_sel *sel = NULL; struct tc_u32_pcnt *pf = NULL; @@ -1209,9 +1207,9 @@ static int u32_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, if (tb[TCA_U32_CLASSID]) { SPRINT_BUF(b1); fprintf(f, "%sflowid %s ", - !sel || !(sel->flags&TC_U32_TERMINAL) ? "*" : "", + !sel || !(sel->flags & TC_U32_TERMINAL) ? "*" : "", sprint_tc_classid(rta_getattr_u32(tb[TCA_U32_CLASSID]), b1)); - } else if (sel && sel->flags&TC_U32_TERMINAL) { + } else if (sel && sel->flags & TC_U32_TERMINAL) { fprintf(f, "terminal flowid ??? 
"); } if (tb[TCA_U32_LINK]) { @@ -1254,16 +1252,16 @@ static int u32_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, } } - if (sel->flags&(TC_U32_VAROFFSET|TC_U32_OFFSET)) { + if (sel->flags & (TC_U32_VAROFFSET | TC_U32_OFFSET)) { fprintf(f, "\n offset "); - if (sel->flags&TC_U32_VAROFFSET) + if (sel->flags & TC_U32_VAROFFSET) fprintf(f, "%04x>>%d at %d ", ntohs(sel->offmask), sel->offshift, sel->offoff); if (sel->off) fprintf(f, "plus %d ", sel->off); } - if (sel->flags&TC_U32_EAT) + if (sel->flags & TC_U32_EAT) fprintf(f, " eat "); if (sel->hmask) { From 55b35567ad05d926c1a73828cef2d4370d928b54 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:08 +0200 Subject: [PATCH 035/151] tc: add a man page for basic filter Cc: Thomas Graf Signed-off-by: Phil Sutter --- man/man8/tc-basic.8 | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 man/man8/tc-basic.8 diff --git a/man/man8/tc-basic.8 b/man/man8/tc-basic.8 new file mode 100644 index 00000000..fb39eaa9 --- /dev/null +++ b/man/man8/tc-basic.8 @@ -0,0 +1,34 @@ +.TH "Basic classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +basic \- basic traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " basic " [ " match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ] [ " +.B classid +.IR CLASSID " ]" +.SH DESCRIPTION +The +.B basic +filter allows to classify packets using the extended match infrastructure. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure. See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . 
+.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8) From 5774f09ee8c6d36c34986bacea423b5b4dd1cb48 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:09 +0200 Subject: [PATCH 036/151] tc: add a man page for cgroup filter Cc: Thomas Graf Signed-off-by: Phil Sutter --- man/man8/tc-cgroup.8 | 80 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 man/man8/tc-cgroup.8 diff --git a/man/man8/tc-cgroup.8 b/man/man8/tc-cgroup.8 new file mode 100644 index 00000000..2bea7d4a --- /dev/null +++ b/man/man8/tc-cgroup.8 @@ -0,0 +1,80 @@ +.TH "Cgroup classifier in tc" 8 " 21 Oct 2015" "iproute2" "Linux" + +.SH NAME +cgroup \- control group based traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " cgroup " [ " match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +This filter serves as a hint to +.B tc +that the assigned class ID of the net_cls control group the process the packet +originates from belongs to should be used for classification. Obviously, it is +useful for locally generated packets only. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure. See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . +.SH EXAMPLES +In order to use this filter, a net_cls control group has to be created first and +class as well as process ID(s) assigned to it. The following creates a net_cls +cgroup named "foobar": + +.RS +.EX +modprobe cls_cgroup +mkdir /sys/fs/cgroup/net_cls +mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls +mkdir /sys/fs/cgroup/net_cls/foobar +.EE +.RE + +To assign a class ID to the created cgroup, a file named +.I net_cls.classid +has to be created which contains the class ID to be assigned as a hexadecimal, +64bit wide number. 
The upper 32bits are reserved for the major handle, the +remaining hold the minor. So a class ID of e.g. +.B ff:be +has to be written like so: +.B 0xff00be +(leading zeroes may be omitted). To continue the above example, the following +assigns class ID 1:2 to foobar cgroup: + +.RS +.EX +echo 0x10002 > /sys/fs/cgroup/net_cls/foobar/net_cls.classid +.EE +.RE + +Finally some PIDs can be assigned to the given cgroup: + +.RS +.EX +echo 1234 > /sys/fs/cgroup/net_cls/foobar/tasks +echo 5678 > /sys/fs/cgroup/net_cls/foobar/tasks +.EE +.RE + +Now by simply attaching a +.B cgroup +filter to a +.B qdisc +makes packets from PIDs 1234 and 5678 be pushed into class 1:2. + +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8), +.br +the file +.I Documentation/cgroups/net_cls.txt +of the Linux kernel tree From 334ddc9b4d526d0fe4487464c1c4e0c0fcb35a30 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:10 +0200 Subject: [PATCH 037/151] tc: add a man page for flow filter Cc: Patrick McHardy Signed-off-by: Phil Sutter --- man/man8/tc-flow.8 | 265 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 man/man8/tc-flow.8 diff --git a/man/man8/tc-flow.8 b/man/man8/tc-flow.8 new file mode 100644 index 00000000..f1b7e2a4 --- /dev/null +++ b/man/man8/tc-flow.8 @@ -0,0 +1,265 @@ +.TH "Flow filter in tc" 8 "20 Oct 2015" "iproute2" "Linux" + +.SH NAME +flow \- flow based traffic control filter +.SH SYNOPSIS +.TP +Mapping mode: + +.RS +.in +8 +.ti -8 +.BR tc " " filter " ... " "flow map key " +.IR KEY " [ " OPS " ] [ " OPTIONS " ] " +.RE +.TP +Hashing mode: + +.RS +.in +8 +.ti -8 +.BR tc " " filter " ... 
" "flow hash keys " +.IR KEY_LIST " [ " +.B perturb +.IR secs " ] [ " OPTIONS " ] " +.RE + +.in +8 +.ti -8 +.IR OPS " := [ " OPS " ] " OP + +.ti -8 +.IR OPTIONS " := [ " +.B divisor +.IR NUM " ] [ " +.B baseclass +.IR ID " ] [ " +.B match +.IR EMATCH_TREE " ] [ " +.B action +.IR ACTION_SPEC " ]" + +.ti -8 +.IR KEY_LIST " := [ " KEY_LIST " ] " KEY + +.ti -8 +.IR OP " := { " +.BR or " | " and " | " xor " | " rshift " | " addend " } " +.I NUM + +.ti -8 +.IR ID " := " X : Y + +.ti -8 +.IR KEY " := { " +.BR src " | " dst " | " proto " | " proto-src " | " proto-dst " | " iif " | " +.BR priority " | " mark " | " nfct " | " nfct-src " | " nfct-dst " | " +.BR nfct-proto-src " | " nfct-proto-dst " | " rt-classid " | " sk-uid " | " +.BR sk-gid " | " vlan-tag " | " rxhash " }" +.SH DESCRIPTION +The +.B flow +classifier is meant to extend the +.B SFQ +hashing capabilities without hard-coding new hash functions. It also allows +deterministic mappings of keys to classes. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI baseclass " ID" +An offset for the resulting class ID. +.I ID +may be +.BR root ", " none +or a hexadecimal class ID in the form [\fIX\fB:\fR]\fIY\fR. If \fIX\fR is +omitted, it is assumed to be zero. +.TP +.BI divisor " NUM" +Number of buckets to use for sorting into. Keys are calculated modulo +.IR NUM . +.TP +.BI "hash keys " KEY-LIST +Perform a +.B jhash2 +operation over the keys in +.IR KEY-LIST , +the result (modulo the +.B divisor +if given) is taken as class ID, optionally offset by the value of +.BR baseclass . +It is possible to specify an interval (in seconds) after which +.BR jhash2 's +entropy source is recreated using the +.B perturb +parameter. +.TP +.BI "map key " KEY +Packet data identified by +.I KEY +is translated into class IDs to push the packet into. The value may be mangled by +.I OPS +before using it for the mapping. 
They are applied in the order listed here: +.RS +.TP 4 +.BI and " NUM" +Perform bitwise +.B AND +operation with numeric value +.IR NUM . +.TP +.BI or " NUM" +Perform bitwise +.B OR +operation with numeric value +.IR NUM . +.TP +.BI xor " NUM" +Perform bitwise +.B XOR +operation with numeric value +.IR NUM . +.TP +.BI rshift " NUM" +Shift the value of +.I KEY +to the right by +.I NUM +bits. +.TP +.BI addend " NUM" +Add +.I NUM +to the value of +.IR KEY . + +.RE +.RS +For the +.BR or ", " and ", " xor " and " rshift +operations, +.I NUM +is assumed to be an unsigned, 32bit integer value. For the +.B addend +operation, +.I NUM +may be much more complex: It may be prefixed by a minus ('-') sign to cause +subtraction instead of addition and for keys of +.BR src ", " dst ", " nfct-src " and " nfct-dst +it may be given in IP address notation. See below for an illustrating example. +.RE +.TP +.BI match " EMATCH_TREE" +Match packets using the extended match infrastructure. See +.BR tc-ematch (8) +for a detailed description of the allowed syntax in +.IR EMATCH_TREE . +.SH KEYS +In mapping mode, a single key is used (after optional permutation) to build a +class ID. The resulting ID is deducible in most cases. In hashing mode, a number +of keys may be specified which are then hashed and the output used as class ID. +This ID is not deducible beforehand, and may even change over time for a +given flow if a +.B perturb +interval has been given. + +The range of class IDs can be limited by the +.B divisor +option, which is used for a modulus. +.TP +.BR src ", " dst +Use source or destination address as key. In case of IPv4 and TIPC, this is the +actual address value. For IPv6, the 128bit address is folded into a 32bit value +by XOR'ing the four 32bit words. In all other cases, the kernel-internal socket +address is used (after folding into 32bits on 64bit systems). +.TP +.B proto +Use the layer four protocol number as key. 
+.TP +.B proto-src +Use the layer four source port as key. If not available, the kernel-internal +socket address is used instead. +.TP +.B proto-dst +Use the layer four destination port as key. If not available, the associated +kernel-internal dst_entry address is used after XOR'ing with the packet's +layer three protocol number. +.TP +.B iif +Use the incoming interface index as key. +.TP +.B priority +Use the packet's priority as key. Usually this is the IP header's DSCP/ECN +value. +.TP +.B mark +Use the netfilter +.B fwmark +as key. +.TP +.B nfct +Use the associated conntrack entry address as key. +.TP +.BR nfct-src ", " nfct-dst ", " nfct-proto-src ", " nfct-proto-dst +These are conntrack-aware variants of +.BR src ", " dst ", " proto-src " and " proto-dst . +In case of NAT, these are basically the packet header's values before NAT was +applied. +.TP +.B rt-classid +Use the packet's destination routing table entry's realm as key. +.TP +.B sk-uid +.TQ +.B sk-gid +For locally generated packets, use the user or group ID the originating socket +belongs to as key. +.TP +.B vlan-tag +Use the packet's vlan ID as key. +.TP +.B rxhash +Use the flow hash as key. + +.SH EXAMPLES +.TP +Classic SFQ hash: + +.EX +tc filter add ... flow hash \\ + keys src,dst,proto,proto-src,proto-dst divisor 1024 +.EE +.TP +Classic SFQ hash, but using information from conntrack to work properly in combination with NAT: + +.EX +tc filter add ... flow hash \\ + keys nfct-src,nfct-dst,proto,nfct-proto-src,nfct-proto-dst \\ + divisor 1024 +.EE +.TP +Map destination IPs of 192.168.0.0/24 to classids 1-257: + +.EX +tc filter add ... flow map \\ + key dst addend -192.168.0.0 divisor 256 +.EE +.TP +Alternative to the above: + +.EX +tc filter add ... flow map \\ + key dst and 0xff +.EE +.TP +The same, but in reverse order: + +.EX +tc filter add ... 
flow map \\ + key dst and 0xff xor 0xff +.EE +.SH SEE ALSO +.BR tc (8), +.BR tc-ematch (8), +.BR tc-sfq (8) From b3aa12a401d27911ebd48fe13f77783d471c07d2 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:11 +0200 Subject: [PATCH 038/151] tc: add a man page for flower filter Cc: Jiri Pirko Signed-off-by: Phil Sutter --- man/man8/tc-flower.8 | 113 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 man/man8/tc-flower.8 diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 new file mode 100644 index 00000000..df4d8e19 --- /dev/null +++ b/man/man8/tc-flower.8 @@ -0,0 +1,113 @@ +.TH "Flower filter in tc" 8 "22 Oct 2015" "iproute2" "Linux" + +.SH NAME +flower \- flow based traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " flower " [ " +.IR MATCH_LIST " ] [ " +.B action +.IR ACTION_SPEC " ] [ " +.B classid +.IR CLASSID " ]" + +.ti -8 +.IR MATCH_LIST " := [ " MATCH_LIST " ] " MATCH + +.ti -8 +.IR MATCH " := { " +.B indev +.IR ifname " | { " +.BR dst_mac " | " src_mac " } " +.IR mac_address " | " +.BR eth_type " { " ipv4 " | " ipv6 " | " +.IR ETH_TYPE " } | " +.BR ip_proto " { " tcp " | " udp " | " +.IR IP_PROTO " } | { " +.BR dst_ip " | " src_ip " } { " +.IR ipv4_address " | " ipv6_address " } | { " +.BR dst_port " | " src_port " } " +.IR port_number " }" +.SH DESCRIPTION +The +.B flower +filter matches flows to the set of keys specified and assigns an arbitrarily +chosen class ID to packets belonging to them. Additionally (or alternatively) an +action from the generic action framework may be called. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Specify a class to pass matching packets on to. +.I CLASSID +is in the form +.BR X : Y ", while " X " and " Y +are interpreted as numbers in hexadecimal format. +.TP +.BI indev " ifname" +Match on incoming interface name. 
Obviously this makes sense only for forwarded +flows. +.I ifname +is the name of an interface which must exist at the time of +.B tc +invocation. +.TP +.BI dst_mac " mac_address" +.TQ +.BI src_mac " mac_address" +Match on source or destination MAC address. +.TP +.BI eth_type " ETH_TYPE" +Match on layer three protocol. +.I ETH_TYPE +may be either +.BR ipv4 , ipv6 +or an unsigned 16bit value in hexadecimal format. +.TP +.BI ip_proto " IP_PROTO" +Match on layer four protocol. +.I IP_PROTO +may be either +.BR tcp , udp +or an unsigned 8bit value in hexadecimal format. +.TP +.BI dst_ip " ADDRESS" +.TQ +.BI src_ip " ADDRESS" +Match on source or destination IP address. +.I ADDRESS +must be a valid IPv4 or IPv6 address, depending on +.BR eth_type , +which has to be specified beforehand. +.TP +.BI dst_port " NUMBER" +.TQ +.BI src_port " NUMBER" +Match on layer 4 protocol source or destination port number. Only available for +.BR ip_proto " values " udp " and " tcp , +which has to be specified beforehand. +.SH NOTES +As stated above where applicable, matches of a certain layer implicitly depend +on the matches of the next lower layer. Precisely, layer one and two matches ( +.BR indev , dst_mac , src_mac " and " eth_type ) +have no dependency, layer three matches ( +.BR ip_proto , dst_ip " and " src_ip ) +require +.B eth_type +being set to either +.BR ipv4 " or " ipv6 , +and finally layer four matches ( +.BR dst_port " and " src_port ) +depend on +.B ip_proto +being set to either +.BR tcp " or " udp . +.P +There can be only one mask per prio. If the user needs to specify a different +mask, a different prio has to be used. 
+.SH SEE ALSO +.BR tc (8), +.BR tc-flow (8) From 49891ba177283d3bece364c50e7c9909e2b733b5 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:12 +0200 Subject: [PATCH 039/151] tc: add a man page for fw filter Cc: Alexey Kuznetsov Signed-off-by: Phil Sutter --- man/man8/tc-fw.8 | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 man/man8/tc-fw.8 diff --git a/man/man8/tc-fw.8 b/man/man8/tc-fw.8 new file mode 100644 index 00000000..d742b473 --- /dev/null +++ b/man/man8/tc-fw.8 @@ -0,0 +1,66 @@ +.TH "Firewall mark classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +fw \- fwmark traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " fw " [ " classid +.IR CLASSID " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +The +.B fw +filter allows to classify packets based on a previously set +.BR fwmark " by " iptables . +If it is identical to the filter's +.BR handle , +the filter matches. +.B iptables +allows to mark single packets with the +.B MARK +target, or whole connections using +.BR CONNMARK . +The benefit of using this filter instead of doing the +heavy-lifting with +.B tc +itself is that on one hand it might be convenient to keep packet filtering and +classification in one place, possibly having to match a packet just once, and on +the other users familiar with +.BR iptables " but not " tc +will have a less hard time adding QoS to their setups. +.SH OPTIONS +.TP +.BI classid " CLASSID" +Push matching packets to the class identified by +.IR CLASSID . +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.SH EXAMPLES +Take e.g. the following tc filter statement: + +.RS +.EX +tc filter add ... handle 6 fw classid 1:1 +.EE +.RE + +will match if the packet's +.B fwmark +value is +.BR 6 . 
+This is a sample +.B iptables +statement marking packets coming in on eth0: + +.RS +.EX +iptables -t mangle -A PREROUTING -i eth0 -j MARK --set-mark 6 +.EE +.RE +.SH SEE ALSO +.BR tc (8), +.BR iptables (8), +.BR iptables-extensions (8) From 02dddd6110309ac37e72c418cfd96684dc763f3e Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:13 +0200 Subject: [PATCH 040/151] tc: add a man page for route filter Cc: Alexey Kuznetsov Signed-off-by: Phil Sutter --- man/man8/tc-route.8 | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 man/man8/tc-route.8 diff --git a/man/man8/tc-route.8 b/man/man8/tc-route.8 new file mode 100644 index 00000000..b865cd11 --- /dev/null +++ b/man/man8/tc-route.8 @@ -0,0 +1,74 @@ +.TH "Route classifier in tc" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +route \- route traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " route " [ " from +.IR REALM " | " +.B fromif +.IR TAG " ] [ " +.B to +.IR REALM " ] [ " +.B classid +.IR CLASSID " ] [ " +.B action +.IR ACTION_SPEC " ]" +.SH DESCRIPTION +Match packets based on routing table entries. This filter centers around the +possibility to assign a +.B realm +to routing table entries. For any packet to be classified by this filter, a +routing table lookup is performed and the returned +.B realm +is used to decide on whether the packet is a match or not. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI from " REALM" +.TQ +.BI fromif " TAG" +Perform source route lookups. +.I TAG +is the name of an interface which must be present on the system at the time of +.B tc +invocation. +.TP +.BI to " REALM" +Match if normal (i.e., destination) routing returns the given +.IR REALM . 
+.SH EXAMPLES +Consider the subnet 192.168.2.0/24 being attached to eth0: + +.RS +.EX +ip route add 192.168.2.0/24 dev eth0 realm 2 +.EE +.RE + +The following +.B route +filter will then match packets from that subnet: + +.RS +.EX +tc filter add ... route from 2 classid 1:2 +.EE +.RE + +and pass packets on to class 1:2. +.SH NOTES +Due to implementation details, +.B realm +values must be in a range from 0 to 255, inclusive. Alternatively, a verbose +name defined in /etc/iproute2/rt_realms may be given instead. +.SH SEE ALSO +.BR tc (8), +.BR ip-route (8) From fc7a72f1ebcd8eeba8788a68f83b0454a1bc7730 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:14 +0200 Subject: [PATCH 041/151] tc: add a man page for tcindex filter Cc: Werner Almesberger Signed-off-by: Phil Sutter --- man/man8/tc-tcindex.8 | 58 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 man/man8/tc-tcindex.8 diff --git a/man/man8/tc-tcindex.8 b/man/man8/tc-tcindex.8 new file mode 100644 index 00000000..7fcf8254 --- /dev/null +++ b/man/man8/tc-tcindex.8 @@ -0,0 +1,58 @@ +.TH "Traffic control index filter" 8 "21 Oct 2015" "iproute2" "Linux" + +.SH NAME +tcindex \- traffic control index filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... " tcindex " [ " hash +.IR SIZE " ] [ " +.B mask +.IR MASK " ] [ " +.B shift +.IR SHIFT " ] [ " +.BR pas_on " | " fall_through " ] [ " classid +.IR CLASSID " ] [ " +.B action +.BR ACTION_SPEC " ]" +.SH DESCRIPTION +This filter allows to match packets based on their +.B tcindex +field value, i.e. the combination of the DSCP and ECN fields as present in IPv4 +and IPv6 headers. +.SH OPTIONS +.TP +.BI action " ACTION_SPEC" +Apply an action from the generic actions framework on matching packets. +.TP +.BI classid " CLASSID" +Push matching packets into the class identified by +.IR CLASSID . +.TP +.BI hash " SIZE" +Hash table size in entries to use. Defaults to 64. 
+.TP +.BI mask " MASK" +An optional bitmask to binary +.BR AND " to the packet's " tcindex +field before use. +.TP +.BI shift " SHIFT" +The number of bits to right-shift a packet's +.B tcindex +value before use. If a +.B mask +has been set, masking is done before shifting. +.TP +.B pass_on +If this flag is set, failure to find a class for the resulting ID will make the +filter fail and lead to the next filter being consulted. +.TP +.B fall_through +This is the opposite of +.B pass_on +and the default. The filter will classify the packet even if there is no class +present for the resulting class ID. + +.SH SEE ALSO +.BR tc (8) From f15a23966fff35e484812ec1d733d9438f658644 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:15 +0200 Subject: [PATCH 042/151] tc: add a man page for u32 filter Cc: Alexey Kuznetsov Signed-off-by: Phil Sutter --- man/man8/tc-u32.8 | 663 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 663 insertions(+) create mode 100644 man/man8/tc-u32.8 diff --git a/man/man8/tc-u32.8 b/man/man8/tc-u32.8 new file mode 100644 index 00000000..47c8f2d0 --- /dev/null +++ b/man/man8/tc-u32.8 @@ -0,0 +1,663 @@ +.TH "Universal 32bit classifier in tc" 8 "25 Sep 2015" "iproute2" "Linux" + +.SH NAME +u32 \- universal 32bit traffic control filter +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " " filter " ... 
[ " handle +.IR HANDLE " ] " +.B u32 +.IR OPTION_LIST " [ " +.B offset +.IR OFFSET " ] [ " +.B hashkey +.IR HASHKEY " ] [ " +.B classid +.IR CLASSID " ] [ " +.B divisor +.IR uint_value " ] [ " +.B order +.IR u32_value " ] [ " +.B ht +.IR HANDLE " ] [ " +.B sample +.IR SELECTOR " [ " +.B divisor +.IR uint_value " ] ] [ " +.B link +.IR HANDLE " ] [ " +.B indev +.IR ifname " ] [ " +.BR help " ]" + +.ti -8 +.IR HANDLE " := { " +\fIu12_hex_htid\fB:\fR[\fIu8_hex_hash\fB:\fR[\fIu12_hex_nodeid\fR] | \fB0x\fIu32_hex_value\fR } + +.ti -8 +.IR OPTION_LIST " := [ " OPTION_LIST " ] " OPTION + +.ti -8 +.IR HASHKEY " := [ " +.B mask +.IR u32_hex_value " ] [ " +.B at +.IR 4*int_value " ]" + +.ti -8 +.IR CLASSID " := { " +.BR root " | " +.BR none " | " +[\fIu16_major\fR]\fB:\fIu16_minor\fR | \fIu32_hex_value\fR } + +.ti -8 +.IR OFFSET " := [ " +.B plus +.IR int_value " ] [ " +.B at +.IR 2*int_value " ] [ " +.B mask +.IR u16_hex_value " ] [ " +.B shift +.IR int_value " ] [ " +.BR eat " ]" + +.ti -8 +.IR OPTION " := { " +.B match +.IR SELECTOR " | " +.B action +.IR ACTION " } " + +.ti -8 +.IR SELECTOR " := { " +.B u32 +.IR VAL_MASK_32 " | " +.B u16 +.IR VAL_MASK_16 " | " +.B u8 +.IR VAL_MASK_8 " | " +.B ip +.IR IP " | " +.B ip6 +.IR IP6 " | { " +.BR tcp " | " udp " } " +.IR TCPUDP " | " +.B icmp +.IR ICMP " | " +.B mark +.IR VAL_MASK_32 " | " +.B ether +.IR ETHER " }" + +.ti -8 +.IR IP " := { { " +.BR src " | " dst " } { " default " | " any " | " all " | " +.IR ip_address " [ " +.BR / " { " +.IR prefixlen " | " netmask " } ] } " AT " | { " +.BR dsfield " | " ihl " | " protocol " | " precedence " | " +.BR icmp_type " | " icmp_code " } " +.IR VAL_MASK_8 " | { " +.BR sport " | " dport " } " +.IR VAL_MASK_16 " | " +.BR nofrag " | " firstfrag " | " df " | " mf " }" + +.ti -8 +.IR IP6 " := { { " +.BR src " | " dst " } { " default " | " any " | " all " | " +.IR ip6_address " [/" prefixlen " ] } " AT " | " +.B priority +.IR VAL_MASK_8 " | { " +.BR protocol " | " icmp_type " | " icmp_code " } 
" +.IR VAL_MASK_8 " | " +.B flowlabel +.IR VAL_MASK_32 " | { " +.BR sport " | " dport " } " +.IR VAL_MASK_16 " }" + +.ti -8 +.IR TCPUDP " := { " +.BR src " | " dst " } " +.I VAL_MASK_16 + +.ti -8 +.IR ICMP " := { " +.B type +.IR VAL_MASK_8 " | " +.B code +.IR VAL_MASK_8 " }" + +.ti -8 +.IR ETHER " := { " +.BR src " | " dst " } " +.IR ether_address " " AT + +.ti -8 +.IR VAL_MASK_32 " := " u32_value " " u32_hex_mask " [ " AT " ]" + +.ti -8 +.IR VAL_MASK_16 " := " u16_value " " u16_hex_mask " [ " AT " ]" + +.ti -8 +.IR VAL_MASK_8 " := " u8_value " " u8_hex_mask " [ " AT " ]" + +.ti -8 +.IR AT " := [ " +.BR at " [ " nexthdr+ " ] " +.IR int_value " ]" +.SH DESCRIPTION +The Universal/Ugly 32bit filter allows to match arbitrary bitfields in the +packet. Due to breaking everything down to values, masks and offsets, It is +equally powerful and hard to use. Luckily many abstracting directives are +present which allow defining rules on a higher level and therefore free the +user from having to fiddle with bits and masks in many cases. + +There are two general modes of invocation: The first mode creates a new filter +to delegate packets to different destinations. Apart from the obvious ones, +namely classifying the packet by specifying a +.I CLASSID +or calling an +.BR action , +one may +.B link +one filter to another one (or even a list of them), effectively organizing +filters into a tree-like hierarchy. + +Typically filter delegation is done by means of a hash table, which leads to the +second mode of invocation: it merely serves to set up these hash tables. Filters +can select a hash table and provide a key selector from which a hash is to be +computed and used as key to lookup the table's bucket which contains filters for +further processing. This is useful if a high number of filters is in use, as the +overhead of performing the hash operation and table lookup becomes negligible in +that case. 
Using hashtables with +.B u32 +basically involves the following pattern: +.IP (1) 4 +Creating a new hash table, specifying its size using the +.B divisor +parameter and ideally a handle by which the table can be identified. If the +latter is not given, the kernel chooses one on its own, which has to be +guessed later. +.IP (2) 4 +Creating filters which link to the created table in +.I (1) +using the +.B link +parameter and defining the packet data which the kernel will use to calculate +the +.BR hashkey . +.IP (3) 4 +Adding filters to buckets in the hash table from +.IR (1) . +In order to avoid having to know how exactly the kernel creates the hash key, +there is the +.B sample +parameter, which gives sample data to hash and thereby defines the table bucket +the filter should be added to. + +.RE +In fact, even if not explicitly requested +.B u32 +creates a hash table for every +.B priority +a filter is being added with. The table's size is 1 though, so it is in fact +merely a linked list. +.SH VALUES +Options and selectors require values to be specified in a specific format, which +is often non-intuitive. Therefore the terminals in +.I SYNOPSIS +have been given descriptive names to indicate the required format and/or maximum +allowed numeric value: Prefixes +.IR u32 ", " u16 " and " u8 +indicate four, two and single byte unsigned values. E.g. +.I u16 +indicates a two byte-sized value in range between 0 and 65535 (0xFFFF) +inclusive. A prefix of +.I int +indicates a four byte signed value. A middle part of +.I _hex_ +indicates that the value is parsed in hexadecimal format. Otherwise, the +value's base is automatically detected, i.e. values prefixed with +.I 0x +are considered hexadecimal, a leading +.I 0 +indicates octal format and decimal format otherwise. There are some values with +special formatting as well: +.IR ip_address " and " netmask +are in dotted-quad formatting as usual for IPv4 addresses.
An +.I ip6_address +is specified in common, colon-separated hexadecimal format. Finally, +.I prefixlen +is an unsigned, decimal integer value in range from 0 to the address width in +bits (32 for IPv4 and 128 for IPv6). + +Sometimes values need to be divisible by a certain number. In that case a name +of the form +.I N*val +was chosen, indicating that +.I val +must be divisible by +.IR N . +Or the other way around: the resulting value must be a multiple of +.IR N . +.SH OPTIONS +.B U32 +recognizes the following options: +.TP +.BI handle " HANDLE" +The handle is used to reference a filter and therefore must be unique. It +consists of a hash table identifier +.B htid +and optional +.B hash +(which identifies the hash table's bucket) and +.BR nodeid . +All these values are parsed as unsigned, hexadecimal numbers with length 12bits +( +.BR htid " and " nodeid ) +or 8bits ( +.BR hash ). +Alternatively one may specify a single, 32bit long hex number which contains +the three fields' bits in concatenated form. Unlike the fields themselves, it +has to be prefixed by +.BR 0x . +.TP +.BI offset " OFFSET" +Set an offset which defines where matches of subsequent filters are applied to. +Therefore this option is useful only when combined with +.BR link " or a combination of " ht " and " sample . +The offset may be given explicitly by using the +.B plus +keyword, or extracted from the packet data with +.BR at . +It is possible to mangle the latter using +.BR mask " and/or " shift +keywords. By default, this offset is recorded but not implicitly applied. It is +used only to substitute the +.B nexthdr+ +statement. Using the keyword +.B eat +though inverts this behaviour: the offset is applied always, and +.B nexthdr+ +will fall back to zero. +.TP +.BI hashkey " HASHKEY" +Specify what packet data to use to calculate a hash key for bucket lookup. The +kernel adjusts the value according to the hash table's size. For this to work, +the option +.B link +must be given.
+.TP +.BI classid " CLASSID" +Classify matching packets into the given +.IR CLASSID , +which consists of either 16bit +.BR major " and " minor +numbers or a single 32bit value combining both. +.TP +.BI divisor " u32_value" +Specify a modulo value. Used when creating hash tables to define their size or +for declaring a +.B sample +to calculate hash table keys from. Must be a power of two with exponent not +exceeding eight. +.TP +.BI order " u32_value" +A value to order filters by, ascending. Conflicts with +.B handle +which serves the same purpose. +.TP +.BI sample " SELECTOR" +Used together with +.B ht +to specify which bucket to add this filter to. This allows one to avoid having +to know how exactly the kernel calculates hashes. The additional +.B divisor +defaults to 256, so must be given for hash tables of different size. +.TP +.BI link " HANDLE" +Delegate matching packets to filters in a hash table. +.I HANDLE +is used to only specify the hash table, so only +.BR htid " may be given, " hash " and " nodeid +have to be omitted. By default, bucket number 0 will be used and can be +overridden by the +.B hashkey +option. +.TP +.BI indev " ifname" +Filter on the incoming interface of the packet. Obviously works only for +forwarded traffic. +.TP +.BI help +Print a brief help text about possible options. +.SH SELECTORS +Basically the only real selector is +.B u32 . +All others merely provide a higher level syntax and are internally translated +into +.B u32 . +.TP +.BI u32 " VAL_MASK_32" +.TQ +.BI u16 " VAL_MASK_16" +.TQ +.BI u8 " VAL_MASK_8" +Match packet data to a given value. The selector name defines the sample length +to extract (32bits for +.BR u32 , +16bits for +.B u16 +and 8bits for +.BR u8 ). +Before comparing, the sample is binary AND'ed with the given mask. This way +uninteresting bits can be cleared before comparison. The position of the sample +is defined by the offset specified in +.IR AT . 
+.TP +.BI ip " IP" +.TQ +.BI ip6 " IP6" +Assume packet starts with an IPv4 ( +.BR ip ) +or IPv6 ( +.BR ip6 ) +header. +.IR IP / IP6 +then allows to match various header fields: +.RS +.TP +.BI src " ADDR" +.BI dst " ADDR" +Compare Source or Destination Address fields against the value of +.IR ADDR . +The reserved words +.BR default ", " any " and " all +effectively match any address. Otherwise an IP address of the particular +protocol is expected, optionally suffixed by a prefix length to match whole +subnets. In case of IPv4 a netmask may also be given. +.TP +.BI dsfield " VAL_MASK_8" +IPv4 only. Match the packet header's DSCP/ECN field. Synonyms to this are +.BR tos " and " precedence . +.TP +.BI ihl " VAL_MASK_8" +IPv4 only. Match the Internet Header Length field. Note that the value's unit is +32bits, so to match a packet with 24byte header length +.I u8_value +has to be 6. +.TP +.BI protocol " VAL_MASK_8" +Match the Protocol (IPv4) or Next Header (IPv6) field value, e.g. 6 for TCP. +.TP +.BI icmp_type " VAL_MASK_8" +.TQ +.BI icmp_code " VAL_MASK_8" +Assume a next-header protocol of icmp or ipv6-icmp and match Type or Code +field values. This is dangerous, as the code assumes minimal header size for +IPv4 and lack of extension headers for IPv6. +.TP +.BI sport " VAL_MASK_16" +.TQ +.BI dport " VAL_MASK_16" +Match layer four source or destination ports. This is dangerous as well, as it +assumes a suitable layer four protocol is present (which has Source and +Destination Port fields right at the start of the header and 16bit in size). +Also minimal header size for IPv4 and lack of IPv6 extension headers is assumed. +.TP +.B nofrag +.TQ +.B firstfrag +.TQ +.B df +.TQ +.B mf +IPv4 only, check certain flags and fragment offset values. Match if the packet +is not a fragment +.RB ( nofrag ), +the first fragment +.RB ( firstfrag ), +if Don't Fragment +.RB ( df ) +or More Fragments +.RB ( mf ) +bits are set. +.TP +.BI priority " VAL_MASK_8" +IPv6 only. 
Match the header's Traffic Class field, which has the same purpose +and semantics as IPv4's ToS field since RFC 3168: upper six bits are DSCP, the +lower two ECN. +.TP +.BI flowlabel " VAL_MASK_32" +IPv6 only. Match the Flow Label field's value. Note that Flow Label itself is +only 20bits long, which are the least significant ones here. The remaining +upper 12bits match Version and Traffic Class fields. +.RE +.TP +.BI tcp " TCPUDP" +.TQ +.BI udp " TCPUDP" +Match fields of next header of protocol TCP or UDP. The possible values for +.I TCPUDP +are: +.RS +.TP +.BI src " VAL_MASK_16" +Match on Source Port field value. +.TP +.BI dst " VAL_MASK_16" +Match on Destination Port field value. +.RE +.TP +.BI icmp " ICMP" +Match fields of next header of protocol ICMP. The possible values for +.I ICMP +are: +.RS +.TP +.BI type " VAL_MASK_8" +Match on ICMP Type field. +.TP +.BI code " VAL_MASK_8" +Match on ICMP Code field. +.RE +.TP +.BI mark " VAL_MASK_32" +Match on netfilter fwmark value. +.TP +.BI ether " ETHER" +Match on ethernet header fields. Possible values for +.I ETHER +are: +.RS +.TP +.BI src " ether_address" " " AT +.TQ +.BI dst " ether_address" " " AT +Match on source or destination ethernet address. This is dangerous: It assumes +an ethernet header is present at the start of the packet. This will probably +lead to unexpected things if used with layer three interfaces like e.g. tun or +ppp. +.SH EXAMPLES +.RS +.EX +tc filter add dev eth0 parent 999:0 prio 99 protocol ip u32 \\ + match ip src 192.168.8.0/24 classid 1:1 +.EE +.RE + +This attaches a filter to the qdisc identified by +.BR 999:0 . +Its priority is +.BR 99 , +which affects in which order multiple filters attached to the same +.B parent +are consulted (the lower the earlier). The filter handles packets of +.B protocol +type +.BR ip , +and +.BR match es +if the IP header's source address is within the +.B 192.168.8.0/24 +subnet. Matching packets are classified into class +.BR 1:1 .
+The effect of this command might be surprising at first glance: + +.RS +.EX +filter parent 1: protocol ip pref 99 u32 +filter parent 1: protocol ip pref 99 u32 \\ + fh 800: ht divisor 1 +filter parent 1: protocol ip pref 99 u32 \\ + fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 \\ + match c0a80800/ffffff00 at 12 +.EE +.RE + +So parent +.B 1: +is assigned a new +.B u32 +filter, which contains a hash table of size 1 (as the +.B divisor +indicates). The table ID is +.BR 800 . +The third line then shows the actual filter which was added above: it sits in +table +.B 800 +and bucket +.BR 0 , +classifies packets into class ID +.B 1:1 +and matches the upper three bytes of the four byte value at offset +.B 12 +to be +.BR 0xc0a808 , +which is 192, 168 and 8. + +Now for something more complicated, namely creating a custom hash table: + +.RS +.EX +tc filter add dev eth0 prio 99 handle 1: u32 divisor 256 +.EE +.RE + +This creates a table of size 256 with handle +.B 1: +in priority +.BR 99 . +The effect is as follows: + +.RS +.EX +filter parent 1: protocol all pref 99 u32 +filter parent 1: protocol all pref 99 u32 fh 1: ht divisor 256 +filter parent 1: protocol all pref 99 u32 fh 800: ht divisor 1 +.EE +.RE + +So along with the requested hash table (handle +.BR 1: ), +the kernel has created its own table of size 1 to hold other filters of the same +priority. + +The next step is to create a filter which links to the created hash table: + +.RS +.EX +tc filter add dev eth0 parent 1: prio 1 u32 \\ + link 1: hashkey mask 0x0000ff00 at 12 \\ + match ip src 192.168.0.0/16 +.EE +.RE + +The filter is given a lower priority than the hash table itself so +.B u32 +consults it before manually traversing the hash table. The options +.BR link " and " hashkey +determine which table and bucket to redirect to. In this case the hash key +should be constructed out of the second byte at offset 12, which corresponds to +an IP packet's third byte of the source address field.
Along with the +.B match +statement, this effectively maps all class C networks below 192.168.0.0/16 to +different buckets of the hash table. + +Filters for certain subnets can be created like so: + +.RS +.EX +tc filter add dev eth0 parent 1: prio 99 u32 \\ + ht 1: sample u32 0x00000800 0x0000ff00 at 12 \\ + match ip src 192.168.8.0/24 classid 1:1 +.EE +.RE + +The bucket is defined using the +.B sample +option: In this case, the second byte at offset 12 must be 0x08, exactly. In +this case, the resulting bucket ID is obviously 8, but as soon as +.B sample +selects an amount of data which could exceed the +.BR divisor , +one would have to know the kernel-internal algorithm to deduce the destination +bucket. This filter's +.B match +statement is redundant in this case, as the entropy for the hash key does not +exceed the table size and therefore no collisions can occur. Otherwise it's +necessary to prevent matching unwanted packets. + +Matching upper layer fields is problematic since IPv4 header length is variable +and IPv6 supports extension headers which affect upper layer header offset. To +overcome this, there is the possibility to specify +.B nexthdr+ +when giving an offset, and to make things easier there are the +.BR tcp " and " udp +matches which use +.B nexthdr+ +implicitly. This offset has to be calculated beforehand though, and the only +way to achieve that is by doing it in a separate filter which then links to the +filter which wants to use it.
Here is an example of doing so: + +.RS +.EX +tc filter add dev eth0 parent 1:0 protocol ip handle 1: \\ + u32 divisor 1 +tc filter add dev eth0 parent 1:0 protocol ip \\ + u32 ht 1: \\ + match tcp src 22 FFFF \\ + classid 1:2 +tc filter add dev eth0 parent 1:0 protocol ip \\ + u32 ht 800: \\ + match ip protocol 6 FF \\ + match ip firstfrag \\ + offset at 0 mask 0f00 shift 6 \\ + link 1: +.EE +.RE + +This is what is being done: In the first call, a single element sized hash table +is created so there is a place to hold the linked to filter and a known handle +.RB ( 1: ) +to reference to it. The second call then adds the actual filter, which pushes +packets with TCP source port 22 into class +.BR 1:2 . +Using +.BR ht , +it is moved into the hash table created by the first call. The third call then +does the actual magic: It matches IPv4 packets with next layer protocol 6 (TCP), +only if it's the first fragment (usually TCP sets DF bit, but if it doesn't and +the packet is fragmented, only the first one contains the TCP header), and then +sets the offset based on the IP header's IHL field (right-shifting by 6 +eliminates the offset of the field and at the same time converts the value into +byte unit). Finally, using +.BR link , +the hash table from first call is referenced which holds the filter from second +call. 
+.SH SEE ALSO +.BR tc (8), +.br +.BR cls_u32.txt " at " http://linux-tc-notes.sourceforge.net/ From a257bc7b4c481d4b2871508edfccc198d1e56c8a Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 23 Oct 2015 19:47:16 +0200 Subject: [PATCH 043/151] tc: ship filter man pages and refer to them in tc.8 Cc: Thomas Graf Cc: Alexey Kuznetsov Cc: Jiri Pirko Cc: Patrick McHardy Cc: Werner Almesberger Signed-off-by: Phil Sutter --- man/man8/Makefile | 4 +++- man/man8/tc.8 | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/man/man8/Makefile b/man/man8/Makefile index 1845987d..2f776406 100644 --- a/man/man8/Makefile +++ b/man/man8/Makefile @@ -12,7 +12,9 @@ MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 rtpr.8 ss. ip-netns.8 ip-ntable.8 ip-rule.8 ip-tunnel.8 ip-xfrm.8 \ ip-tcp_metrics.8 ip-netconf.8 ip-token.8 \ tipc.8 tipc-bearer.8 tipc-link.8 tipc-media.8 tipc-nametable.8 \ - tipc-node.8 tipc-socket.8 + tipc-node.8 tipc-socket.8 \ + tc-basic.8 tc-cgroup.8 tc-flow.8 tc-flower.8 tc-fw.8 tc-route.8 \ + tc-tcindex.8 tc-u32.8 all: $(TARGETS) diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 87350113..700b960c 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -144,6 +144,50 @@ It is important to notice that filters reside .B within qdiscs - they are not masters of what happens. +The available filters are: +.TP +basic +Filter packets based on an ematch expression. See +.BR tc-ematch (8) +for details. +.TP +bpf +Filter packets using (e)BPF, see +.BR tc-bpf (8) +for details. +.TP +cgroup +Filter packets based on the control group of their process. See +. BR tc-cgroup (8) +for details. +.TP +flow, flower +Flow-based classifiers, filtering packets based on their flow (identified by selectable keys). See +.BR tc-flow "(8) and" +.BR tc-flower (8) +for details. +.TP +fw +Filter based on fwmark. Directly maps fwmark value to traffic class. See +.BR tc-fw (8). 
+.TP +route +Filter packets based on routing table. See +.BR tc-route (8) +for details. +.TP +rsvp +Match Resource Reservation Protocol (RSVP) packets. +.TP +tcindex +Filter packets based on traffic control index. See +.BR tc-tcindex (8). +.TP +u32 +Generic filtering on arbitrary packet data, assisted by syntax to abstract common operations. See +.BR tc-u32 (8) +for details. + .SH CLASSLESS QDISCS The classless qdiscs are: .TP @@ -655,15 +699,20 @@ Shows classes as ASCII graph with stats info under each class. .B tc was written by Alexey N. Kuznetsov and added in Linux 2.2. .SH SEE ALSO +.BR tc-basic (8), .BR tc-bfifo (8), .BR tc-bpf (8), .BR tc-cbq (8), +.BR tc-cgroup (8), .BR tc-choke (8), .BR tc-codel (8), .BR tc-drr (8), .BR tc-ematch (8), +.BR tc-flow (8), +.BR tc-flower (8), .BR tc-fq (8), .BR tc-fq_codel (8), +.BR tc-fw (8), .BR tc-hfsc (7), .BR tc-hfsc (8), .BR tc-htb (8), @@ -671,10 +720,13 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR tc-pfifo (8), .BR tc-pfifo_fast (8), .BR tc-red (8), +.BR tc-route (8), .BR tc-sfb (8), .BR tc-sfq (8), .BR tc-stab (8), .BR tc-tbf (8), +.BR tc-tcindex (8), +.BR tc-u32 (8), .br .RB "User documentation at " http://lartc.org/ ", but please direct bugreports and patches to: " From f7520a1998995f29730692212c1c87a39064e168 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 23 Oct 2015 15:41:58 -0700 Subject: [PATCH 044/151] ip: remove extra newlines at end-of-file Shouldn't have extra blank lines.
--- ip/ipaddrlabel.c | 1 - ip/ipfou.c | 1 - ip/iplink_bond_slave.c | 1 - ip/ipprefix.c | 1 - ip/iproute.c | 1 - ip/tcp_metrics.c | 1 - 6 files changed, 6 deletions(-) diff --git a/ip/ipaddrlabel.c b/ip/ipaddrlabel.c index a738ded0..f01bc269 100644 --- a/ip/ipaddrlabel.c +++ b/ip/ipaddrlabel.c @@ -263,4 +263,3 @@ int do_ipaddrlabel(int argc, char **argv) fprintf(stderr, "Command \"%s\" is unknown, try \"ip addrlabel help\".\n", *argv); exit(-1); } - diff --git a/ip/ipfou.c b/ip/ipfou.c index 0b83c277..8a86b18f 100644 --- a/ip/ipfou.c +++ b/ip/ipfou.c @@ -156,4 +156,3 @@ int do_ipfou(int argc, char **argv) fprintf(stderr, "Command \"%s\" is unknown, try \"ip fou help\".\n", *argv); exit(-1); } - diff --git a/ip/iplink_bond_slave.c b/ip/iplink_bond_slave.c index 33bea913..9b569b1d 100644 --- a/ip/iplink_bond_slave.c +++ b/ip/iplink_bond_slave.c @@ -113,4 +113,3 @@ struct link_util bond_slave_link_util = { .parse_opt = bond_slave_parse_opt, .slave = true, }; - diff --git a/ip/ipprefix.c b/ip/ipprefix.c index 26b59615..ee51f04d 100644 --- a/ip/ipprefix.c +++ b/ip/ipprefix.c @@ -107,4 +107,3 @@ int print_prefix(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) return 0; } - diff --git a/ip/iproute.c b/ip/iproute.c index ae86cc0d..eab512d9 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1811,4 +1811,3 @@ int do_iproute(int argc, char **argv) fprintf(stderr, "Command \"%s\" is unknown, try \"ip route help\".\n", *argv); exit(-1); } - diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c index bdc503ef..57b605fd 100644 --- a/ip/tcp_metrics.c +++ b/ip/tcp_metrics.c @@ -508,4 +508,3 @@ int do_tcp_metrics(int argc, char **argv) "try \"ip tcp_metrics help\".\n", *argv); exit(-1); } - From 753ef5bbd60891437739e8ceee711957fa7a4bd2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 23 Oct 2015 15:43:28 -0700 Subject: [PATCH 045/151] tc: remove extra whitespace No blank lines at EOF, or trailing whitespace. 
--- tc/README.last | 2 -- tc/emp_ematch.y | 1 - tc/m_action.c | 1 - tc/m_ipt.c | 1 - tc/m_xt_old.c | 1 - tc/p_tcp.c | 2 -- tc/p_udp.c | 1 - tc/q_cbq.c | 1 - tc/q_netem.c | 1 - tc/q_prio.c | 1 - tc/q_tbf.c | 1 - tc/tc_filter.c | 1 - tc/tc_stab.c | 1 - tc/tc_util.c | 1 - 14 files changed, 16 deletions(-) diff --git a/tc/README.last b/tc/README.last index 9400438a..63f6f7b0 100644 --- a/tc/README.last +++ b/tc/README.last @@ -43,5 +43,3 @@ It is available only for alpha and pentiums with correct CPU timestamp. It is the fastest way, use it when it is available, but remember: not all pentiums have this facility, and a lot of them have clock, broken by APM etc. etc. - - diff --git a/tc/emp_ematch.y b/tc/emp_ematch.y index bc08da20..2e6cf353 100644 --- a/tc/emp_ematch.y +++ b/tc/emp_ematch.y @@ -98,4 +98,3 @@ invert: { ematch_err = strdup(s); } - diff --git a/tc/m_action.c b/tc/m_action.c index d363d273..f88ff3a1 100644 --- a/tc/m_action.c +++ b/tc/m_action.c @@ -648,4 +648,3 @@ int do_action(int argc, char **argv) return 0; } - diff --git a/tc/m_ipt.c b/tc/m_ipt.c index e5c48977..948becbc 100644 --- a/tc/m_ipt.c +++ b/tc/m_ipt.c @@ -618,4 +618,3 @@ struct action_util ipt_action_util = { .parse_aopt = parse_ipt, .print_aopt = print_ipt, }; - diff --git a/tc/m_xt_old.c b/tc/m_xt_old.c index 0ea0b4a9..6e643088 100644 --- a/tc/m_xt_old.c +++ b/tc/m_xt_old.c @@ -433,4 +433,3 @@ struct action_util ipt_action_util = { .parse_aopt = parse_ipt, .print_aopt = print_ipt, }; - diff --git a/tc/p_tcp.c b/tc/p_tcp.c index 7f4b6f4a..32ffc027 100644 --- a/tc/p_tcp.c +++ b/tc/p_tcp.c @@ -34,5 +34,3 @@ struct m_pedit_util p_pedit_tcp = { "tcp", parse_tcp, }; - - diff --git a/tc/p_udp.c b/tc/p_udp.c index 17762896..2b9b88fc 100644 --- a/tc/p_udp.c +++ b/tc/p_udp.c @@ -35,4 +35,3 @@ struct m_pedit_util p_pedit_udp = { "udp", parse_udp, }; - diff --git a/tc/q_cbq.c b/tc/q_cbq.c index d76600cc..38a61630 100644 --- a/tc/q_cbq.c +++ b/tc/q_cbq.c @@ -582,4 +582,3 @@ struct qdisc_util 
cbq_qdisc_util = { .parse_copt = cbq_parse_class_opt, .print_copt = cbq_print_opt, }; - diff --git a/tc/q_netem.c b/tc/q_netem.c index cd990a0a..7bc8c6a5 100644 --- a/tc/q_netem.c +++ b/tc/q_netem.c @@ -688,4 +688,3 @@ struct qdisc_util netem_qdisc_util = { .parse_qopt = netem_parse_opt, .print_qopt = netem_print_opt, }; - diff --git a/tc/q_prio.c b/tc/q_prio.c index bacc7024..3236bec1 100644 --- a/tc/q_prio.c +++ b/tc/q_prio.c @@ -122,4 +122,3 @@ struct qdisc_util prio_qdisc_util = { .parse_qopt = prio_parse_opt, .print_qopt = prio_print_opt, }; - diff --git a/tc/q_tbf.c b/tc/q_tbf.c index 2d563311..0981e6f7 100644 --- a/tc/q_tbf.c +++ b/tc/q_tbf.c @@ -328,4 +328,3 @@ struct qdisc_util tbf_qdisc_util = { .parse_qopt = tbf_parse_opt, .print_qopt = tbf_print_opt, }; - diff --git a/tc/tc_filter.c b/tc/tc_filter.c index 9e416008..ff03db8f 100644 --- a/tc/tc_filter.c +++ b/tc/tc_filter.c @@ -375,4 +375,3 @@ int do_filter(int argc, char **argv) fprintf(stderr, "Command \"%s\" is unknown, try \"tc filter help\".\n", *argv); return -1; } - diff --git a/tc/tc_stab.c b/tc/tc_stab.c index 286681f3..aba8ae87 100644 --- a/tc/tc_stab.c +++ b/tc/tc_stab.c @@ -148,4 +148,3 @@ void print_size_table(FILE *fp, const char *prefix, struct rtattr *rta) } #endif } - diff --git a/tc/tc_util.c b/tc/tc_util.c index aa6de244..4764ecce 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -608,4 +608,3 @@ compat_xstats: if (tb[TCA_XSTATS] && xstats) *xstats = tb[TCA_XSTATS]; } - From 1473bda921dafc2b8b631472c8398223dfa8c528 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 23 Oct 2015 15:44:30 -0700 Subject: [PATCH 046/151] misc: cleanup extra whitespace No blank lines at end of file --- misc/ifstat.c | 2 +- misc/lnstat.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/misc/ifstat.c b/misc/ifstat.c index 9118c80b..20a8db45 100644 --- a/misc/ifstat.c +++ b/misc/ifstat.c @@ -272,7 +272,7 @@ static void dump_raw_db(FILE *fp, int to_hist) if (jw) { jsonw_name(jw, n->name); 
jsonw_start_object(jw); - + for (i=0; i Date: Fri, 23 Oct 2015 15:47:07 -0700 Subject: [PATCH 047/151] add new IFLA_VF_TRUST netlink attribute --- include/linux/if_link.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 288d3cd6..31f584cb 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -548,6 +548,7 @@ enum { * on/off switch */ IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ __IFLA_VF_MAX, }; @@ -609,6 +610,11 @@ enum { #define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + /* VF ports management section * * Nested layout of set/get msg is: From 958cd210942c8d4c1756957843bd2bf52b57ebb5 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 29 Oct 2015 10:55:21 +0100 Subject: [PATCH 048/151] ifcfg: add manpage --- man/man8/ifcfg.8 | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 man/man8/ifcfg.8 diff --git a/man/man8/ifcfg.8 b/man/man8/ifcfg.8 new file mode 100644 index 00000000..79033bde --- /dev/null +++ b/man/man8/ifcfg.8 @@ -0,0 +1,48 @@ +.TH IFCFG 8 "September 24 2009" "iproute2" "Linux" +.SH NAME +ifcfg \- simplistic script which replaces ifconfig IP management +.SH SYNOPSIS +.ad l +.in +8 +.ti -8 +.B ifcfg +.RI "[ " DEVICE " ] [ " command " ] " ADDRESS " [ " PEER " ] " +.sp + +.SH DESCRIPTION +This manual page briefly documents the +.B ifcfg +command. +.PP +This is a simplistic script replacing one option of +.BR ifconfig , +namely, IP address management. It not only adds +addresses, but also carries out Duplicate Address Detection RFC-DHCP, +sends unsolicited ARP to update the caches of other hosts sharing +the interface, adds some control routes and restarts Router Discovery +when it is necessary. + +.SH IFCONFIG - COMMAND SYNTAX + +.SS +.TP +.B DEVICE +- it may have alias, suffix, separated by colon.
+ +.TP +.B command +- add, delete or stop. + +.TP +.B ADDRESS +- optionally followed by prefix length. + +.TP +.B PEER +- optional peer address for point-to-point interfaces. + +.SH NOTES +This script is not suitable for use with IPv6. + +.SH SEE ALSO +.RB "IP Command reference " ip-cref.ps From 7124942942e530299b3d50b2b11b30265065dd5c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 29 Oct 2015 10:55:22 +0100 Subject: [PATCH 049/151] genl: add manpage --- man/man8/genl.8 | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 man/man8/genl.8 diff --git a/man/man8/genl.8 b/man/man8/genl.8 new file mode 100644 index 00000000..b9de594d --- /dev/null +++ b/man/man8/genl.8 @@ -0,0 +1,77 @@ +.TH GENL 8 "29 Oct 2015" "iproute2" "Linux" +.SH NAME +genl \- generic netlink utility frontend +.SH SYNOPSIS +.in +8 +.ti -8 +.BR genl " [ " -s [ tatistics "] ] [ " -d [ etails "] ] [ " -r [ aw "] ] " OBJECT + +.ti -8 +.BR genl " { " -V [ ersion "] | " -h [ elp "] }" + +.ti -8 +.IR OBJECT " := { " +.B ctrl +.IR CTRL_OPTS " }" + +.ti -8 +.IR CTRL_OPTS " := { " +.BR help " | " list " | " monitor " | " get +.IR PARMS " }" + +.ti -8 +.IR PARMS " := { " +.B name +.IR NAME " | " +.B id +.IR ID " }" +.SH DESCRIPTION +The +.B genl +utility provides a simple frontend to the generic netlink library. Although it's +designed to support multiple +.IR OBJECT s, +for now only the +.B ctrl +object is available, which is used to query the generic netlink controller. +.SS ctrl +The generic netlink controller can be queried in various ways: +.TP +.B help +This command just prints a help text for the +.B ctrl +object. +.TP +.B list +Show the registered netlink users. +.TP +.B monitor +Listen for generic netlink notifications. +.TP +.B get +Query the controller for a given user, identified either by +.BR name " or " id . +.SH OPTIONS +genl supports the following options. +.TP +.B \-h, \-help +Show summary of options.
+.TP +.B \-V, \-Version +Show version of program. +.TP +.B \-s, \-stats, \-statistics +Show object statistics. +.TP +.B \-d, \-details +Show object details. +.TP +.B \-r, \-raw +Dump raw output only. +.SH SEE ALSO +.BR ip (8) +.br +.SH AUTHOR +genl was written by Jamal Hadi Salim . +.PP +This manual page was written by Petr Sabata . From 17c53fcd2c76b84c7c7eb2bd082d9e40dd8c41c6 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 29 Oct 2015 10:55:23 +0100 Subject: [PATCH 050/151] ifstat: add manpage --- man/man8/ifstat.8 | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 man/man8/ifstat.8 diff --git a/man/man8/ifstat.8 b/man/man8/ifstat.8 new file mode 100644 index 00000000..e49d8680 --- /dev/null +++ b/man/man8/ifstat.8 @@ -0,0 +1,59 @@ +.TH IFSTAT 8 "28 Oct 2015" "iproute2" "Linux" +.SH NAME +ifstat \- handy utility to read network interface statistics +.SH SYNOPSIS +.in +8 +.ti -8 +.BR ifstat " [ " +.IR OPTIONS " ] [ " INTERFACE_LIST " ]" + +.ti -8 +.IR INTERFACE_LIST " := " INTERFACE_LIST " | " interface +.SH DESCRIPTION +\fBifstat\fP neatly prints out network interface statistics. +The utility keeps records of the previous data displayed in history files and +by default only shows difference between the last and the current call. +Location of the history files defaults to /tmp/.ifstat.u$UID but may be +overridden with the IFSTAT_HISTORY environment variable. +.SH OPTIONS +.TP +.B \-h, \-\-help +Show summary of options. +.TP +.B \-V, \-\-version +Show version of program. +.TP +.B \-a, \-\-ignore +Ignore the history file. +.TP +.B \-d, \-\-scan=SECS +Sample statistics every SECS second. +.TP +.B \-e, \-\-errors +Show errors. +.TP +.B \-n, \-\-nooutput +Don't display any output. Update the history file only. +.TP +.B \-r, \-\-reset +Reset history. +.TP +.B \-s, \-\-noupdate +Don't update the history file. +.TP +.B \-t, \-\-interval=SECS +Report average over the last SECS seconds. 
+.TP +.B \-z, \-\-zeros +Show entries with zero activity. +.SH ENVIRONMENT +.TP +.B IFSTAT_HISTORY +If set, its value is interpreted as an alternate history file path. +.SH SEE ALSO +.BR ip (8) +.br +.SH AUTHOR +ifstat was written by Alexey Kuznetsov . +.PP +This manual page was written by Petr Sabata . From bd5bbad45007937758ec1339f2612049ce7187d7 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 29 Oct 2015 10:55:24 +0100 Subject: [PATCH 051/151] bridge: fdb: minor syntax fix in help text --- bridge/fdb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index 5ea50abb..4d109251 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -31,11 +31,11 @@ static unsigned int filter_index; static void usage(void) { - fprintf(stderr, "Usage: bridge fdb { add | append | del | replace ADDR dev DEV\n" + fprintf(stderr, "Usage: bridge fdb { add | append | del | replace } ADDR dev DEV\n" " [ self ] [ master ] [ use ] [ router ]\n" " [ local | temp ] [ dst IPADDR ] [ vlan VID ]\n" - " [ port PORT] [ vni VNI ] [via DEV]\n"); - fprintf(stderr, " bridge fdb {show} [ br BRDEV ] [ brport DEV ]\n"); + " [ port PORT] [ vni VNI ] [ via DEV ]\n"); + fprintf(stderr, " bridge fdb [ show [ br BRDEV ] [ brport DEV ] ]\n"); exit(-1); } From b5bb1820e8a7666a1cab56bf8370b338647fd7a2 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 29 Oct 2015 17:20:56 +0100 Subject: [PATCH 052/151] lib/utils: improve error messages of get_addr() and get_prefix() Instead of statically complaining about illegal inet address, use get_family() to get the address family right. Based on a patch by Hangbin Liu to print "inet6" for AF_INET6 made more generic by me.
Signed-off-by: Phil Sutter --- lib/utils.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/utils.c b/lib/utils.c index 107e3f57..939a44f0 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -531,7 +531,8 @@ done: int get_addr(inet_prefix *dst, const char *arg, int family) { if (get_addr_1(dst, arg, family)) { - fprintf(stderr, "Error: an inet address is expected rather than \"%s\".\n", arg); + fprintf(stderr, "Error: %s address is expected rather than \"%s\".\n", + family_name(family) ,arg); exit(1); } return 0; @@ -544,7 +545,8 @@ int get_prefix(inet_prefix *dst, char *arg, int family) exit(1); } if (get_prefix_1(dst, arg, family)) { - fprintf(stderr, "Error: an inet prefix is expected rather than \"%s\".\n", arg); + fprintf(stderr, "Error: %s prefix is expected rather than \"%s\".\n", + family_name(family) ,arg); exit(1); } return 0; From 6720eceff7b4d4629f2c8779bce0a741f09ee97a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Nov 2015 16:34:46 -0800 Subject: [PATCH 053/151] v4.3.0 --- include/SNAPSHOT.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/SNAPSHOT.h b/include/SNAPSHOT.h index 68dc6b43..bd2ccea6 100644 --- a/include/SNAPSHOT.h +++ b/include/SNAPSHOT.h @@ -1 +1 @@ -static const char SNAPSHOT[] = "150831"; +static const char SNAPSHOT[] = "151103"; From caf8875b3cb535b6cdfc52bae1e6339ef83c42a7 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 17 Nov 2015 16:08:00 +0200 Subject: [PATCH 054/151] misc/Makefile: use PKG_CONFIG Use PKG_CONFIG from Config - it works better when cross-compiling. 
Signed-off-by: Aaro Koskinen --- misc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/misc/Makefile b/misc/Makefile index 389c1b04..f50e7403 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -10,8 +10,8 @@ ifeq ($(HAVE_BERKELEY_DB),y) endif ifeq ($(HAVE_SELINUX),y) - LDLIBS += $(shell pkg-config --libs libselinux) - CFLAGS += $(shell pkg-config --cflags libselinux) -DHAVE_SELINUX + LDLIBS += $(shell $(PKG_CONFIG) --libs libselinux) + CFLAGS += $(shell $(PKG_CONFIG) --cflags libselinux) -DHAVE_SELINUX endif ifeq ($(IP_CONFIG_SETNS),y) From 8de592d05c81e373028191141bdba05ee17c50c7 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:54 +0100 Subject: [PATCH 055/151] ip{, 6}tunnel: get rid of extraneous whitespace when printing Put whitespace in the beginning of optional parts, not as suffix anywhere. Also drop double whitespaces in between words. Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 4 ++-- ip/iptunnel.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 9884efd4..07010d31 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -111,9 +111,9 @@ static void print_tunnel(struct ip6_tnl_parm2 *p) printf(" key %u", ntohl(p->i_key)); else if ((p->i_flags|p->o_flags)&GRE_KEY) { if (p->i_flags&GRE_KEY) - printf(" ikey %u ", ntohl(p->i_key)); + printf(" ikey %u", ntohl(p->i_key)); if (p->o_flags&GRE_KEY) - printf(" okey %u ", ntohl(p->o_key)); + printf(" okey %u", ntohl(p->o_key)); } if (p->i_flags&GRE_SEQ) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 78fa9885..36534f29 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -343,7 +343,7 @@ static void print_tunnel(struct ip_tunnel_parm *p) /* Do not use format_host() for local addr, * symbolic name will not be useful. */ - printf("%s: %s/ip remote %s local %s ", + printf("%s: %s/ip remote %s local %s", p->name, tnl_strproto(p->iph.protocol), p->iph.daddr ? 
format_host(AF_INET, 4, &p->iph.daddr, s1, sizeof(s1)) : "any", @@ -371,13 +371,13 @@ static void print_tunnel(struct ip_tunnel_parm *p) if (p->link) { const char *n = ll_index_to_name(p->link); if (n) - printf(" dev %s ", n); + printf(" dev %s", n); } if (p->iph.ttl) - printf(" ttl %d ", p->iph.ttl); + printf(" ttl %d", p->iph.ttl); else - printf(" ttl inherit "); + printf(" ttl inherit"); if (p->iph.tos) { SPRINT_BUF(b1); @@ -393,11 +393,11 @@ static void print_tunnel(struct ip_tunnel_parm *p) printf(" nopmtudisc"); if (p->iph.protocol == IPPROTO_IPV6 && !tnl_ioctl_get_6rd(p->name, &ip6rd) && ip6rd.prefixlen) { - printf(" 6rd-prefix %s/%u ", + printf(" 6rd-prefix %s/%u", inet_ntop(AF_INET6, &ip6rd.prefix, s1, sizeof(s1)), ip6rd.prefixlen); if (ip6rd.relay_prefix) { - printf("6rd-relay_prefix %s/%u ", + printf(" 6rd-relay_prefix %s/%u", format_host(AF_INET, 4, &ip6rd.relay_prefix, s1, sizeof(s1)), ip6rd.relay_prefixlen); } @@ -407,9 +407,9 @@ static void print_tunnel(struct ip_tunnel_parm *p) printf(" key %u", ntohl(p->i_key)); else if ((p->i_flags|p->o_flags)&GRE_KEY) { if (p->i_flags&GRE_KEY) - printf(" ikey %u ", ntohl(p->i_key)); + printf(" ikey %u", ntohl(p->i_key)); if (p->o_flags&GRE_KEY) - printf(" okey %u ", ntohl(p->o_key)); + printf(" okey %u", ntohl(p->o_key)); } if (p->i_flags&GRE_SEQ) From a7ed1520ee9645e31b94caf72cad47be31856745 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:55 +0100 Subject: [PATCH 056/151] ip/tunnel: introduce tnl_parse_key() Instead of duplicating the same code six times (key, ikey and okey in iptunnel and ip6tunnel), have a common parsing routine. This has the added benefit of having the same verbose error message in ip6tunnel as well as iptunnel. I'm not sure if parsing an IPv4 address as key makes sense for ip6tunnel, but the code was there before so this patch at least doesn't make it worse. 
Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 33 +++------------------------------ ip/iptunnel.c | 33 +++------------------------------ ip/tunnel.c | 15 +++++++++++++++ ip/tunnel.h | 1 + 4 files changed, 22 insertions(+), 60 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 07010d31..8b842b68 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -230,45 +230,18 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) invarg("not inherit", *argv); p->flags |= IP6_TNL_F_RCV_DSCP_COPY; } else if (strcmp(*argv, "key") == 0) { - unsigned uval; NEXT_ARG(); p->i_flags |= GRE_KEY; p->o_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->i_key = p->o_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0) < 0) { - fprintf(stderr, "invalid value of \"key\"\n"); - exit(-1); - } - p->i_key = p->o_key = htonl(uval); - } + p->i_key = p->o_key = tnl_parse_key("key", *argv); } else if (strcmp(*argv, "ikey") == 0) { - unsigned uval; NEXT_ARG(); p->i_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->i_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0)<0) { - fprintf(stderr, "invalid value of \"ikey\"\n"); - exit(-1); - } - p->i_key = htonl(uval); - } + p->i_key = tnl_parse_key("ikey", *argv); } else if (strcmp(*argv, "okey") == 0) { - unsigned uval; NEXT_ARG(); p->o_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->o_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0)<0) { - fprintf(stderr, "invalid value of \"okey\"\n"); - exit(-1); - } - p->o_key = htonl(uval); - } + p->o_key = tnl_parse_key("okey", *argv); } else if (strcmp(*argv, "seq") == 0) { p->i_flags |= GRE_SEQ; p->o_flags |= GRE_SEQ; diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 36534f29..9c9dc548 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -106,45 +106,18 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) exit(-1); } } else if (strcmp(*argv, "key") == 0) { - unsigned uval; NEXT_ARG(); p->i_flags |= 
GRE_KEY; p->o_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->i_key = p->o_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0)<0) { - fprintf(stderr, "invalid value for \"key\": \"%s\"; it should be an unsigned integer\n", *argv); - exit(-1); - } - p->i_key = p->o_key = htonl(uval); - } + p->i_key = p->o_key = tnl_parse_key("key", *argv); } else if (strcmp(*argv, "ikey") == 0) { - unsigned uval; NEXT_ARG(); p->i_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->i_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0)<0) { - fprintf(stderr, "invalid value for \"ikey\": \"%s\"; it should be an unsigned integer\n", *argv); - exit(-1); - } - p->i_key = htonl(uval); - } + p->i_key = tnl_parse_key("ikey", *argv); } else if (strcmp(*argv, "okey") == 0) { - unsigned uval; NEXT_ARG(); p->o_flags |= GRE_KEY; - if (strchr(*argv, '.')) - p->o_key = get_addr32(*argv); - else { - if (get_unsigned(&uval, *argv, 0)<0) { - fprintf(stderr, "invalid value for \"okey\": \"%s\"; it should be an unsigned integer\n", *argv); - exit(-1); - } - p->o_key = htonl(uval); - } + p->o_key = tnl_parse_key("okey", *argv); } else if (strcmp(*argv, "seq") == 0) { p->i_flags |= GRE_SEQ; p->o_flags |= GRE_SEQ; diff --git a/ip/tunnel.c b/ip/tunnel.c index d69fe84d..79f2201f 100644 --- a/ip/tunnel.c +++ b/ip/tunnel.c @@ -180,3 +180,18 @@ int tnl_ioctl_get_6rd(const char *name, void *p) { return tnl_gen_ioctl(SIOCGET6RD, name, p, EINVAL); } + +__be32 tnl_parse_key(const char *name, const char *key) +{ + unsigned uval; + + if (strchr(key, '.')) + return get_addr32(key); + + if (get_unsigned(&uval, key, 0) < 0) { + fprintf(stderr, "invalid value for \"%s\": \"%s\";", name, key); + fprintf(stderr, " it should be an unsigned integer\n"); + exit(-1); + } + return htonl(uval); +} diff --git a/ip/tunnel.h b/ip/tunnel.h index 9c2f5d29..9fb4a186 100644 --- a/ip/tunnel.h +++ b/ip/tunnel.h @@ -31,5 +31,6 @@ int tnl_del_ioctl(const char *basedev, const char *name, void *p); int 
tnl_prl_ioctl(int cmd, const char *name, void *p); int tnl_6rd_ioctl(int cmd, const char *name, void *p); int tnl_ioctl_get_6rd(const char *name, void *p); +__be32 tnl_parse_key(const char *name, const char *key); #endif From 6ddb1e8c900668a22aaaf8d5f3b1586c0e88efcd Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:56 +0100 Subject: [PATCH 057/151] ip{, 6}tunnel: unify behaviour if physical device is not found Make ip6tunnel print an error message as well. While there, get rid of unnecessary line breaking. Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 4 +++- ip/iptunnel.c | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 8b842b68..410276f1 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -278,8 +278,10 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) } if (medium[0]) { p->link = ll_name_to_index(medium); - if (p->link == 0) + if (p->link == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", medium); return -1; + } } return 0; } diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 9c9dc548..803bb832 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -228,8 +228,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) if (medium[0]) { p->link = if_nametoindex(medium); if (p->link == 0) { - fprintf(stderr, "Cannot find device \"%s\"\n", - medium); + fprintf(stderr, "Cannot find device \"%s\"\n", medium); return -1; } } From 4b3cb96281ebf39fa63734c7879fbf078d1bfa7b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:57 +0100 Subject: [PATCH 058/151] iptunnel: use ll_name_to_index() for physical interface lookup Although the cache is only initialized in do_show(), this way it is at least consistent with ip6tunnel. 
Signed-off-by: Phil Sutter --- ip/iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 803bb832..a5478529 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -226,7 +226,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) } if (medium[0]) { - p->link = if_nametoindex(medium); + p->link = ll_name_to_index(medium); if (p->link == 0) { fprintf(stderr, "Cannot find device \"%s\"\n", medium); return -1; From c4527d7ba36edd214d5e1a700affd5bf89c5cecc Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:58 +0100 Subject: [PATCH 059/151] ip{,6}tunnel: align do_tunnels_list() a bit In iptunnel, declare loop variables inside the loop as done in ip6tunnel. Fix and simplify goto logic in ip6tunnel: - Failure to read over header lines would have left fp opened. - By returning directly upon fopen() failure, fp can be closed unconditionally in the end. Use the same goto logic in iptunnel, as well. 
Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 8 +++----- ip/iptunnel.c | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 410276f1..ba92518a 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -326,14 +326,14 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) FILE *fp = fopen("/proc/net/dev", "r"); if (fp == NULL) { perror("fopen"); - goto end; + return -1; } /* skip two lines at the begenning of the file */ if (!fgets(buf, sizeof(buf), fp) || !fgets(buf, sizeof(buf), fp)) { fprintf(stderr, "/proc/net/dev read error\n"); - return -1; + goto end; } while (fgets(buf, sizeof(buf), fp) != NULL) { @@ -395,10 +395,8 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) printf("\n"); } err = 0; - end: - if (fp) - fclose(fp); + fclose(fp); return err; } diff --git a/ip/iptunnel.c b/ip/iptunnel.c index a5478529..e323c1f7 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -396,14 +396,8 @@ static void print_tunnel(struct ip_tunnel_parm *p) static int do_tunnels_list(struct ip_tunnel_parm *p) { - char name[IFNAMSIZ]; - unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, - rx_fifo, rx_frame, - tx_bytes, tx_packets, tx_errs, tx_drops, - tx_fifo, tx_colls, tx_carrier, rx_multi; - struct ip_tunnel_parm p1; - char buf[512]; + int err = -1; FILE *fp = fopen("/proc/net/dev", "r"); if (fp == NULL) { perror("fopen"); @@ -414,19 +408,24 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) if (!fgets(buf, sizeof(buf), fp) || !fgets(buf, sizeof(buf), fp)) { fprintf(stderr, "/proc/net/dev read error\n"); - fclose(fp); - return -1; + goto end; } while (fgets(buf, sizeof(buf), fp) != NULL) { + char name[IFNAMSIZ]; int index, type; + unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, + rx_fifo, rx_frame, + tx_bytes, tx_packets, tx_errs, tx_drops, + tx_fifo, tx_colls, tx_carrier, rx_multi; + struct ip_tunnel_parm p1; char *ptr; + buf[sizeof(buf) - 1] = 0; if ((ptr = strchr(buf, ':')) == NULL || 
(*ptr++ = 0, sscanf(buf, "%s", name) != 1)) { fprintf(stderr, "Wrong format for /proc/net/dev. Giving up.\n"); - fclose(fp); - return -1; + goto end; } if (sscanf(ptr, "%ld%ld%ld%ld%ld%ld%ld%*d%ld%ld%ld%ld%ld%ld%ld", &rx_bytes, &rx_packets, &rx_errs, &rx_drops, @@ -467,8 +466,10 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) } printf("\n"); } + err = 0; + end: fclose(fp); - return 0; + return err; } static int do_show(int argc, char **argv) From 9af72f819e3fa288fd56e74d14a1253bd49adb9d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:08:59 +0100 Subject: [PATCH 060/151] ip6tunnel: print local/remote addresses like iptunnel does This makes output consistent with iptunnel, also supporting reverse DNS lookup for remote address if requested. Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index ba92518a..9eb5b2f1 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -68,14 +68,17 @@ static void usage(void) static void print_tunnel(struct ip6_tnl_parm2 *p) { - char remote[64]; - char local[64]; - - inet_ntop(AF_INET6, &p->raddr, remote, sizeof(remote)); - inet_ntop(AF_INET6, &p->laddr, local, sizeof(local)); + char s1[1024]; + char s2[1024]; + /* Do not use format_host() for local addr, + * symbolic name will not be useful. 
+ */ printf("%s: %s/ipv6 remote %s local %s", - p->name, tnl_strproto(p->proto), remote, local); + p->name, + tnl_strproto(p->proto), + format_host(AF_INET6, 16, &p->raddr, s1, sizeof(s1)), + rt_addr_n2a(AF_INET6, 16, &p->laddr, s2, sizeof(s2))); if (p->link) { const char *n = ll_index_to_name(p->link); if (n) From 7894ce7722773b9bfbdb1097218a37e13494a927 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:00 +0100 Subject: [PATCH 061/151] ip6tunnel: fix coding style: no newline between brace and else Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 9eb5b2f1..d8957f0e 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -262,8 +262,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) } else { if (strcmp(*argv, "name") == 0) { NEXT_ARG(); - } - else if (matches(*argv, "help") == 0) + } else if (matches(*argv, "help") == 0) usage(); if (p->name[0]) duparg2("name", *argv); From 2520598a1aa846c51d023714f7e713cd6e2dd56b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:01 +0100 Subject: [PATCH 062/151] iptunnel: share common code when setting tunnel mode Signed-off-by: Phil Sutter --- ip/iptunnel.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index e323c1f7..92edb34b 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -47,6 +47,15 @@ static void usage(void) exit(-1); } +static void set_tunnel_proto(struct ip_tunnel_parm *p, int proto) +{ + if (p->iph.protocol && p->iph.protocol != proto) { + fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); + exit(-1); + } + p->iph.protocol = proto; +} + static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) { int count = 0; @@ -68,38 +77,18 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) 
NEXT_ARG(); if (strcmp(*argv, "ipip") == 0 || strcmp(*argv, "ip/ip") == 0) { - if (p->iph.protocol && p->iph.protocol != IPPROTO_IPIP) { - fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); - exit(-1); - } - p->iph.protocol = IPPROTO_IPIP; + set_tunnel_proto(p, IPPROTO_IPIP); } else if (strcmp(*argv, "gre") == 0 || strcmp(*argv, "gre/ip") == 0) { - if (p->iph.protocol && p->iph.protocol != IPPROTO_GRE) { - fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); - exit(-1); - } - p->iph.protocol = IPPROTO_GRE; + set_tunnel_proto(p, IPPROTO_GRE); } else if (strcmp(*argv, "sit") == 0 || strcmp(*argv, "ipv6/ip") == 0) { - if (p->iph.protocol && p->iph.protocol != IPPROTO_IPV6) { - fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); - exit(-1); - } - p->iph.protocol = IPPROTO_IPV6; + set_tunnel_proto(p, IPPROTO_IPV6); } else if (strcmp(*argv, "isatap") == 0) { - if (p->iph.protocol && p->iph.protocol != IPPROTO_IPV6) { - fprintf(stderr, "You managed to ask for more than one tunnel mode.\n"); - exit(-1); - } - p->iph.protocol = IPPROTO_IPV6; + set_tunnel_proto(p, IPPROTO_IPV6); isatap++; } else if (strcmp(*argv, "vti") == 0) { - if (p->iph.protocol && p->iph.protocol != IPPROTO_IPIP) { - fprintf(stderr, "You managed to ask for more than one tunnel mode.\n"); - exit(-1); - } - p->iph.protocol = IPPROTO_IPIP; + set_tunnel_proto(p, IPPROTO_IPIP); p->i_flags |= VTI_ISVTI; } else { fprintf(stderr,"Unknown tunnel mode \"%s\"\n", *argv); From 0dd4d2b37fb4f88518f73e4635435e918093397f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:02 +0100 Subject: [PATCH 063/151] iptunnel: simplify parsing TTL, allow 'hlim' as identifier Instead of parsing an unsigned integer and checking boundaries, simply parse u8. This and the added ttl alias 'hlim' provide consistency with ip6tunnel. 
Signed-off-by: Phil Sutter --- ip/iptunnel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 92edb34b..8c05f6fe 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -141,14 +141,13 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) NEXT_ARG(); strncpy(medium, *argv, IFNAMSIZ-1); } else if (strcmp(*argv, "ttl") == 0 || - strcmp(*argv, "hoplimit") == 0) { - unsigned uval; + strcmp(*argv, "hoplimit") == 0 || + strcmp(*argv, "hlim") == 0) { + __u8 uval; NEXT_ARG(); if (strcmp(*argv, "inherit") != 0) { - if (get_unsigned(&uval, *argv, 0)) + if (get_u8(&uval, *argv, 0)) invarg("invalid TTL\n", *argv); - if (uval > 255) - invarg("TTL must be <=255\n", *argv); p->iph.ttl = uval; } } else if (strcmp(*argv, "tos") == 0 || From c957821b186e799fc86ae82c7d153b598a7a9c61 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:03 +0100 Subject: [PATCH 064/151] iptunnel: share common code when determining the default interface name Signed-off-by: Phil Sutter --- ip/iptunnel.c | 70 ++++++++++++++++++--------------------------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 8c05f6fe..3b46a159 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -239,10 +239,26 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) return 0; } +static const char *tnl_defname(const struct ip_tunnel_parm *p) +{ + switch (p->iph.protocol) { + case IPPROTO_IPIP: + if (p->i_flags & VTI_ISVTI) + return "ip_vti0"; + else + return "tunl0"; + case IPPROTO_GRE: + return "gre0"; + case IPPROTO_IPV6: + return "sit0"; + } + return NULL; +} static int do_add(int cmd, int argc, char **argv) { struct ip_tunnel_parm p; + const char *basedev; if (parse_args(argc, argv, cmd, &p) < 0) return -1; @@ -252,21 +268,12 @@ static int do_add(int cmd, int argc, char **argv) return -1; } - switch (p.iph.protocol) { - case IPPROTO_IPIP: - if 
(p.i_flags & VTI_ISVTI) - return tnl_add_ioctl(cmd, "ip_vti0", p.name, &p); - else - return tnl_add_ioctl(cmd, "tunl0", p.name, &p); - case IPPROTO_GRE: - return tnl_add_ioctl(cmd, "gre0", p.name, &p); - case IPPROTO_IPV6: - return tnl_add_ioctl(cmd, "sit0", p.name, &p); - default: + if (!(basedev = tnl_defname(&p))) { fprintf(stderr, "cannot determine tunnel mode (ipip, gre, vti or sit)\n"); return -1; } - return -1; + + return tnl_add_ioctl(cmd, basedev, p.name, &p); } static int do_del(int argc, char **argv) @@ -276,20 +283,7 @@ static int do_del(int argc, char **argv) if (parse_args(argc, argv, SIOCDELTUNNEL, &p) < 0) return -1; - switch (p.iph.protocol) { - case IPPROTO_IPIP: - if (p.i_flags & VTI_ISVTI) - return tnl_del_ioctl("ip_vti0", p.name, &p); - else - return tnl_del_ioctl("tunl0", p.name, &p); - case IPPROTO_GRE: - return tnl_del_ioctl("gre0", p.name, &p); - case IPPROTO_IPV6: - return tnl_del_ioctl("sit0", p.name, &p); - default: - return tnl_del_ioctl(p.name, p.name, &p); - } - return -1; + return tnl_del_ioctl(tnl_defname(&p) ? : p.name, p.name, &p); } static void print_tunnel(struct ip_tunnel_parm *p) @@ -462,31 +456,17 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) static int do_show(int argc, char **argv) { - int err; struct ip_tunnel_parm p; + const char *basedev; ll_init_map(&rth); if (parse_args(argc, argv, SIOCGETTUNNEL, &p) < 0) return -1; - switch (p.iph.protocol) { - case IPPROTO_IPIP: - if (p.i_flags & VTI_ISVTI) - err = tnl_get_ioctl(p.name[0] ? p.name : "ip_vti0", &p); - else - err = tnl_get_ioctl(p.name[0] ? p.name : "tunl0", &p); - break; - case IPPROTO_GRE: - err = tnl_get_ioctl(p.name[0] ? p.name : "gre0", &p); - break; - case IPPROTO_IPV6: - err = tnl_get_ioctl(p.name[0] ? p.name : "sit0", &p); - break; - default: - do_tunnels_list(&p); - return 0; - } - if (err) + if (!(basedev = tnl_defname(&p))) + return do_tunnels_list(&p); + + if (tnl_get_ioctl(p.name[0] ? 
p.name : basedev, &p)) return -1; print_tunnel(&p); From f53ecee818280961d86c4ffb07c367144b91490d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:04 +0100 Subject: [PATCH 065/151] iptunnel: sanitize copying tunnel name Since p->name is only IFNAMSIZ bytes, do not copy more than IFNAMSIZ - 1 bytes into it so there remains at least a single null byte in the end. Signed-off-by: Phil Sutter --- ip/iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 3b46a159..b377a5b4 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -175,7 +175,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) usage(); if (p->name[0]) duparg2("name", *argv); - strncpy(p->name, *argv, IFNAMSIZ); + strncpy(p->name, *argv, IFNAMSIZ - 1); if (cmd == SIOCCHGTUNNEL && count == 0) { struct ip_tunnel_parm old_p; memset(&old_p, 0, sizeof(old_p)); From 04ce8d3edaa1d6021921a074fa90fc9db4d3f6b7 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 13 Nov 2015 18:09:05 +0100 Subject: [PATCH 066/151] ip{,6}tunnel: put spaces around non-unary operators Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 16 ++++++++-------- ip/iptunnel.c | 40 ++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index d8957f0e..320d2539 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -110,22 +110,22 @@ static void print_tunnel(struct ip6_tnl_parm2 *p) printf(" dscp inherit"); if (p->proto == IPPROTO_GRE) { - if ((p->i_flags&GRE_KEY) && (p->o_flags&GRE_KEY) && p->o_key == p->i_key) + if ((p->i_flags & GRE_KEY) && (p->o_flags & GRE_KEY) && p->o_key == p->i_key) printf(" key %u", ntohl(p->i_key)); - else if ((p->i_flags|p->o_flags)&GRE_KEY) { - if (p->i_flags&GRE_KEY) + else if ((p->i_flags | p->o_flags) & GRE_KEY) { + if (p->i_flags & GRE_KEY) printf(" ikey %u", ntohl(p->i_key)); - if (p->o_flags&GRE_KEY) + if (p->o_flags & GRE_KEY) 
printf(" okey %u", ntohl(p->o_key)); } - if (p->i_flags&GRE_SEQ) + if (p->i_flags & GRE_SEQ) printf("%s Drop packets out of sequence.", _SL_); - if (p->i_flags&GRE_CSUM) + if (p->i_flags & GRE_CSUM) printf("%s Checksum in received packet is required.", _SL_); - if (p->o_flags&GRE_SEQ) + if (p->o_flags & GRE_SEQ) printf("%s Sequence packets on output.", _SL_); - if (p->o_flags&GRE_CSUM) + if (p->o_flags & GRE_CSUM) printf("%s Checksum output packets.", _SL_); } } diff --git a/ip/iptunnel.c b/ip/iptunnel.c index b377a5b4..b9552edc 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -139,7 +139,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) p->iph.saddr = htonl(INADDR_ANY); } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - strncpy(medium, *argv, IFNAMSIZ-1); + strncpy(medium, *argv, IFNAMSIZ - 1); } else if (strcmp(*argv, "ttl") == 0 || strcmp(*argv, "hoplimit") == 0 || strcmp(*argv, "hlim") == 0) { @@ -336,14 +336,14 @@ static void print_tunnel(struct ip_tunnel_parm *p) if (p->iph.tos) { SPRINT_BUF(b1); printf(" tos"); - if (p->iph.tos&1) + if (p->iph.tos & 1) printf(" inherit"); - if (p->iph.tos&~1) - printf("%c%s ", p->iph.tos&1 ? '/' : ' ', - rtnl_dsfield_n2a(p->iph.tos&~1, b1, sizeof(b1))); + if (p->iph.tos & ~1) + printf("%c%s ", p->iph.tos & 1 ? 
'/' : ' ', + rtnl_dsfield_n2a(p->iph.tos & ~1, b1, sizeof(b1))); } - if (!(p->iph.frag_off&htons(IP_DF))) + if (!(p->iph.frag_off & htons(IP_DF))) printf(" nopmtudisc"); if (p->iph.protocol == IPPROTO_IPV6 && !tnl_ioctl_get_6rd(p->name, &ip6rd) && ip6rd.prefixlen) { @@ -357,22 +357,22 @@ static void print_tunnel(struct ip_tunnel_parm *p) } } - if ((p->i_flags&GRE_KEY) && (p->o_flags&GRE_KEY) && p->o_key == p->i_key) + if ((p->i_flags & GRE_KEY) && (p->o_flags & GRE_KEY) && p->o_key == p->i_key) printf(" key %u", ntohl(p->i_key)); - else if ((p->i_flags|p->o_flags)&GRE_KEY) { - if (p->i_flags&GRE_KEY) + else if ((p->i_flags | p->o_flags) & GRE_KEY) { + if (p->i_flags & GRE_KEY) printf(" ikey %u", ntohl(p->i_key)); - if (p->o_flags&GRE_KEY) + if (p->o_flags & GRE_KEY) printf(" okey %u", ntohl(p->o_key)); } - if (p->i_flags&GRE_SEQ) + if (p->i_flags & GRE_SEQ) printf("%s Drop packets out of sequence.", _SL_); - if (p->i_flags&GRE_CSUM) + if (p->i_flags & GRE_CSUM) printf("%s Checksum in received packet is required.", _SL_); - if (p->o_flags&GRE_SEQ) + if (p->o_flags & GRE_SEQ) printf("%s Sequence packets on output.", _SL_); - if (p->o_flags&GRE_CSUM) + if (p->o_flags & GRE_CSUM) printf("%s Checksum output packets.", _SL_); } @@ -592,19 +592,19 @@ int do_iptunnel(int argc, char **argv) if (argc > 0) { if (matches(*argv, "add") == 0) - return do_add(SIOCADDTUNNEL, argc-1, argv+1); + return do_add(SIOCADDTUNNEL, argc - 1, argv + 1); if (matches(*argv, "change") == 0) - return do_add(SIOCCHGTUNNEL, argc-1, argv+1); + return do_add(SIOCCHGTUNNEL, argc - 1, argv + 1); if (matches(*argv, "delete") == 0) - return do_del(argc-1, argv+1); + return do_del(argc - 1, argv + 1); if (matches(*argv, "show") == 0 || matches(*argv, "lst") == 0 || matches(*argv, "list") == 0) - return do_show(argc-1, argv+1); + return do_show(argc - 1, argv + 1); if (matches(*argv, "prl") == 0) - return do_prl(argc-1, argv+1); + return do_prl(argc - 1, argv + 1); if (matches(*argv, "6rd") == 0) - return 
do_6rd(argc-1, argv+1); + return do_6rd(argc - 1, argv + 1); if (matches(*argv, "help") == 0) usage(); } else From 85e3c87c824e256ac182b15d3993e8526e1f8608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Skytt=C3=A4?= Date: Sat, 7 Nov 2015 11:52:59 +0200 Subject: [PATCH 067/151] man: Syntax and warning fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix syntax issues and warnings highlighted by `man --warnings=w' from man-db 2.7.1. Signed-off-by: Ville Skyttä --- man/man8/tc-bpf.8 | 2 +- man/man8/tipc-bearer.8 | 4 ++-- man/man8/tipc-link.8 | 6 +++--- man/man8/tipc-media.8 | 4 ++-- man/man8/tipc-nametable.8 | 4 ++-- man/man8/tipc-node.8 | 4 ++-- man/man8/tipc-socket.8 | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/man/man8/tc-bpf.8 b/man/man8/tc-bpf.8 index 2c02ab29..f5201d36 100644 --- a/man/man8/tc-bpf.8 +++ b/man/man8/tc-bpf.8 @@ -844,7 +844,7 @@ result in the default classid: Basically, such a minimal generator is equivalent to: .in +4n -.B tcpdump -iem1 -ddd 'tcp[tcpflags] & tcp-syn != 0' | tr '\\n' ',' > /var/bpf/tcp-syn +.B tcpdump -iem1 -ddd 'tcp[tcpflags] & tcp-syn != 0' | tr '\\\\n' ',' > /var/bpf/tcp-syn .in Since diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 index f59c39d2..50a1ed24 100644 --- a/man/man8/tipc-bearer.8 +++ b/man/man8/tipc-bearer.8 @@ -1,7 +1,7 @@ .TH TIPC-BEARER 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. -./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-bearer \- show or modify TIPC bearers diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 index 899b8825..3be8c9ad 100644 --- a/man/man8/tipc-link.8 +++ b/man/man8/tipc-link.8 @@ -1,7 +1,7 @@ .TH TIPC-LINK 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. 
-./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-link \- show links or modify link properties @@ -33,7 +33,7 @@ tipc-link \- show links or modify link properties .I LINK .RB "] | " "reset .BI "link " "LINK " -.R } +} .ti -8 .B tipc link list diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 index 7f94efec..6c6e2b15 100644 --- a/man/man8/tipc-media.8 +++ b/man/man8/tipc-media.8 @@ -1,7 +1,7 @@ .TH TIPC-MEDIA 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. -./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-media \- list or modify media properties diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 index c8d573f3..d3397f97 100644 --- a/man/man8/tipc-nametable.8 +++ b/man/man8/tipc-nametable.8 @@ -1,7 +1,7 @@ .TH TIPC-NAMETABLE 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. -./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-nametable \- show TIPC nametable diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 index 66418b35..ef32ec7c 100644 --- a/man/man8/tipc-node.8 +++ b/man/man8/tipc-node.8 @@ -1,7 +1,7 @@ .TH TIPC-NODE 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. -./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. 
+.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-node \- modify and show local node parameters or list peer nodes diff --git a/man/man8/tipc-socket.8 b/man/man8/tipc-socket.8 index af18e35b..23ec1e51 100644 --- a/man/man8/tipc-socket.8 +++ b/man/man8/tipc-socket.8 @@ -1,7 +1,7 @@ .TH TIPC-SOCKET 8 "02 Jun 2015" "iproute2" "Linux" -./ For consistency, please keep padding right aligned. -./ For example '.B "foo " bar' and not '.B foo " bar"' +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' .SH NAME tipc-socket \- show TIPC socket (port) information From ac0817ef6612a0ffb7d2b99891d6f7db48d33a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Skytt=C3=A4?= Date: Sat, 7 Nov 2015 11:53:00 +0200 Subject: [PATCH 068/151] man: Spelling fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ville Skyttä --- man/man8/bridge.8 | 2 +- man/man8/ifcfg.8 | 2 +- man/man8/lnstat.8 | 2 +- man/man8/tc-bpf.8 | 2 +- man/man8/tc-cbq-details.8 | 2 +- man/man8/tc-mqprio.8 | 2 +- man/man8/tc.8 | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 222a4381..deccb1cc 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -237,7 +237,7 @@ state the port for list for STP BPDUs and drop all other traffic. .B 2 - STP LEARNING state. Only valid if STP is enabled on the bridge. In this state the port will accept traffic only for the purpose of updating MAC -adress tables. +address tables. 
.sp .B 3 diff --git a/man/man8/ifcfg.8 b/man/man8/ifcfg.8 index 79033bde..1a3786c1 100644 --- a/man/man8/ifcfg.8 +++ b/man/man8/ifcfg.8 @@ -1,6 +1,6 @@ .TH IFCFG 8 "September 24 2009" "iproute2" "Linux" .SH NAME -ifcfg \- simplistic script which replaces ifconfig IP managment +ifcfg \- simplistic script which replaces ifconfig IP management .SH SYNOPSIS .ad l .in +8 diff --git a/man/man8/lnstat.8 b/man/man8/lnstat.8 index 69fe876f..acd5f4a2 100644 --- a/man/man8/lnstat.8 +++ b/man/man8/lnstat.8 @@ -172,7 +172,7 @@ Number of dropped conntrack entries to make room for new ones, if maximum table size was reached. .sp .B icmp_error -Number of packets wich could not be tracked due to error situation. This is a +Number of packets which could not be tracked due to error situation. This is a subset of \fBinvalid\fP. .sp .B expect_new diff --git a/man/man8/tc-bpf.8 b/man/man8/tc-bpf.8 index f5201d36..c8d5c5f9 100644 --- a/man/man8/tc-bpf.8 +++ b/man/man8/tc-bpf.8 @@ -394,7 +394,7 @@ socket, and spawning an application such as \&. This approach's advantage is that tc will place the file descriptors into the environment and thus make them available just like stdin, stdout, stderr file descriptors, meaning, in case user applications run from within -this fd-owner shell, they can terminate and restart without loosing eBPF +this fd-owner shell, they can terminate and restart without losing eBPF maps file descriptors. Example invocation with the previous classifier and action mixture: diff --git a/man/man8/tc-cbq-details.8 b/man/man8/tc-cbq-details.8 index ddaf3ca7..86353b58 100644 --- a/man/man8/tc-cbq-details.8 +++ b/man/man8/tc-cbq-details.8 @@ -197,7 +197,7 @@ priority yielded a class, enter the fallback algorithm. The fallback algorithm resides outside of the loop and is as follows. .TP (i) -Consult the defmap of the class at which the jump to fallback occured. If +Consult the defmap of the class at which the jump to fallback occurred. 
If the defmap contains a class for the .B priority diff --git a/man/man8/tc-mqprio.8 b/man/man8/tc-mqprio.8 index da3bf089..0e1d305d 100644 --- a/man/man8/tc-mqprio.8 +++ b/man/man8/tc-mqprio.8 @@ -85,7 +85,7 @@ belong to an application. See kernel and cgroup documentation for details. .SH QDISC PARAMETERS .TP num_tc -Number of traffic classes to use upto 16 classes supported. +Number of traffic classes to use. Up to 16 classes supported. .TP map diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 700b960c..7a1090b9 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -609,7 +609,7 @@ to .TP .BR "\-cf" , " \-conf " -specifies path to the config file. This option is used in conjuction with other options (e.g. +specifies path to the config file. This option is used in conjunction with other options (e.g. .BR -nm ")." .SH FORMAT From 5699275b424c637edc60f46c89071d4e8f4edde3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Nov 2015 15:41:37 -0800 Subject: [PATCH 069/151] man8: scrub trailing whitespace Remove extraneous whitespace --- man/man8/bridge.8 | 6 +- man/man8/ip-addrlabel.8 | 5 +- man/man8/ip-neighbour.8 | 4 +- man/man8/ip-ntable.8 | 4 +- man/man8/ip-rule.8 | 2 +- man/man8/ip-xfrm.8 | 2 +- man/man8/ip.8 | 2 +- man/man8/routel.8 | 20 ++--- man/man8/rtacct.8 | 1 - man/man8/rtmon.8 | 8 +- man/man8/ss.8 | 6 +- man/man8/tc-bfifo.8 | 32 ++++--- man/man8/tc-cbq-details.8 | 178 +++++++++++++++++++------------------- man/man8/tc-cbq.8 | 158 +++++++++++++++++---------------- man/man8/tc-drr.8 | 1 - man/man8/tc-htb.8 | 80 +++++++++-------- man/man8/tc-netem.8 | 22 ++--- man/man8/tc-pfifo_fast.8 | 14 ++- man/man8/tc-prio.8 | 28 +++--- man/man8/tc-red.8 | 62 +++++++------ man/man8/tc-sfq.8 | 40 ++++----- man/man8/tc-tbf.8 | 48 +++++----- man/man8/tc.8 | 1 - 23 files changed, 351 insertions(+), 373 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index deccb1cc..c9a550dd 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -91,7 +91,7 @@ bridge 
\- show / manipulate bridge addresses and devices .BR "bridge vlan" " { " add " | " del " } " .B dev .IR DEV -.B vid +.B vid .IR VID " [ " .BR pvid " ] [ " untagged " ] [ " .BR self " ] [ " master " ] " @@ -159,7 +159,7 @@ return code will be non zero. - Bridge port. .TP -.B fdb +.B fdb - Forwarding Database entry. .TP @@ -384,7 +384,7 @@ If omitted the default value is used. .BI via " DEVICE" device name of the outgoing interface for the VXLAN device driver to reach the -remote VXLAN tunnel endpoint. +remote VXLAN tunnel endpoint. .SS bridge fdb append - append a forwarding database entry This command adds a new fdb entry with an already known diff --git a/man/man8/ip-addrlabel.8 b/man/man8/ip-addrlabel.8 index 5fc18fec..51ef5727 100644 --- a/man/man8/ip-addrlabel.8 +++ b/man/man8/ip-addrlabel.8 @@ -7,8 +7,8 @@ ip-addrlabel \- protocol address label management .in +8 .ti -8 .B ip -.RI "[ " OPTIONS " ]" -.B addrlabel +.RI "[ " OPTIONS " ]" +.B addrlabel .RI " { " COMMAND " | " .BR help " }" .sp @@ -66,4 +66,3 @@ flush all address labels in the kernel. This does not restore any default settin .SH AUTHOR Manpage by Yoshifuji Hideaki / 吉藤英明 - diff --git a/man/man8/ip-neighbour.8 b/man/man8/ip-neighbour.8 index b0fc0dd8..c9b0256e 100644 --- a/man/man8/ip-neighbour.8 +++ b/man/man8/ip-neighbour.8 @@ -33,9 +33,9 @@ ip-neighbour \- neighbour/arp tables management. .SH DESCRIPTION -The +The .B ip neigh -command manipulates +command manipulates .I neighbour objects that establish bindings between protocol addresses and link layer addresses for hosts sharing the same link. diff --git a/man/man8/ip-ntable.8 b/man/man8/ip-ntable.8 index d903a170..462e5896 100644 --- a/man/man8/ip-ntable.8 +++ b/man/man8/ip-ntable.8 @@ -55,7 +55,7 @@ ip-ntable - neighbour table configuration .SH DESCRIPTION .I ip ntable -controls the parameters for the neighbour tables. +controls the parameters for the neighbour tables.
.SS ip ntable show - list the ip neighbour tables @@ -98,4 +98,4 @@ default value (3) to 8 packets. .BR ip (8) .SH AUTHOR -Manpage by Stephen Hemminger +Manpage by Stephen Hemminger diff --git a/man/man8/ip-rule.8 b/man/man8/ip-rule.8 index b1d03e79..b7008c6a 100644 --- a/man/man8/ip-rule.8 +++ b/man/man8/ip-rule.8 @@ -62,7 +62,7 @@ ip-rule \- routing policy database management .SH DESCRIPTION .I ip rule -manipulates rules +manipulates rules in the routing policy database control the route selection algorithm. .P diff --git a/man/man8/ip-xfrm.8 b/man/man8/ip-xfrm.8 index 489ab6ed..dae07288 100644 --- a/man/man8/ip-xfrm.8 +++ b/man/man8/ip-xfrm.8 @@ -121,7 +121,7 @@ ip-xfrm \- transform configuration .ti -8 .IR ALGO " :=" -.RB "{ " enc " | " auth " } " +.RB "{ " enc " | " auth " } " .IR ALGO-NAME " " ALGO-KEYMAT " |" .br .B auth-trunc diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1bdee118..b1f69073 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -12,7 +12,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels .sp .ti -8 -.B ip +.B ip .RB "[ " -force " ] " .BI "-batch " filename .sp diff --git a/man/man8/routel.8 b/man/man8/routel.8 index cdf8f55b..82d580fb 100644 --- a/man/man8/routel.8 +++ b/man/man8/routel.8 @@ -1,16 +1,16 @@ .TH "ROUTEL" "8" "3 Jan, 2008" "iproute2" "Linux" .SH "NAME" -.LP +.LP routel \- list routes with pretty output format .br routef \- flush routes .SH "SYNTAX" -.LP +.LP routel [\fItablenr\fP [\fIraw ip args...\fP]] -.br +.br routef .SH "DESCRIPTION" -.LP +.LP These programs are a set of helper scripts you can use instead of raw iproute2 commands. .br The routel script will list routes in a format that some might consider easier to interpret then the ip route list equivalent. @@ -18,15 +18,15 @@ The routel script will list routes in a format that some might consider easier t The routef script does not take any arguments and will simply flush the routing table down the drain. Beware! 
This means deleting all routes which will make your network unusable! .SH "FILES" -.LP -\fI/usr/bin/routef\fP -.br -\fI/usr/bin/routel\fP +.LP +\fI/usr/bin/routef\fP +.br +\fI/usr/bin/routel\fP .SH "AUTHORS" -.LP +.LP The routel script was written by Stephen R. van den Berg , 1999/04/18 and donated to the public domain. .br This manual page was written by Andreas Henriksson , for the Debian GNU/Linux system. .SH "SEE ALSO" -.LP +.LP ip(8) diff --git a/man/man8/rtacct.8 b/man/man8/rtacct.8 index c3ab03de..7cf97aa4 100644 --- a/man/man8/rtacct.8 +++ b/man/man8/rtacct.8 @@ -47,4 +47,3 @@ Time interval to average rates. Default value is 60 seconds. .SH SEE ALSO lnstat(8) - diff --git a/man/man8/rtmon.8 b/man/man8/rtmon.8 index 05387520..38a2b774 100644 --- a/man/man8/rtmon.8 +++ b/man/man8/rtmon.8 @@ -10,11 +10,11 @@ This manual page documents briefly the command. .PP .B rtmon -listens on -.I netlink +listens on +.I netlink socket and monitors routing table changes. -.I rtmon +.I rtmon can be started before the first network configuration command is issued. For example if you insert: @@ -61,7 +61,7 @@ to display logged output from file. .SH SEE ALSO .BR ip (8) .SH AUTHOR -.B rtmon +.B rtmon was written by Alexey Kuznetsov . .PP This manual page was written by Michael Prokop , diff --git a/man/man8/ss.8 b/man/man8/ss.8 index 6afbabbf..f4d5264f 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -12,7 +12,7 @@ to It can display more TCP and state informations than other tools. .SH OPTIONS -When no option is used ss displays a list of +When no option is used ss displays a list of open non-listening sockets (e.g. TCP/UNIX/UDP) that have established connection. 
.TP .B \-h, \-\-help @@ -189,10 +189,10 @@ List all the tcp sockets in state FIN-WAIT-1 for our apache to network 193.233.7 .BR /usr/share/doc/iproute-doc/ss.html " (package iproute­doc)", .br .BR RFC " 793 " -- https://tools.ietf.org/rfc/rfc793.txt (TCP states) +- https://tools.ietf.org/rfc/rfc793.txt (TCP states) .SH AUTHOR -.I ss +.I ss was written by Alexey Kuznetsov, . .PP This manual page was written by Michael Prokop diff --git a/man/man8/tc-bfifo.8 b/man/man8/tc-bfifo.8 index f04090c0..3e290322 100644 --- a/man/man8/tc-bfifo.8 +++ b/man/man8/tc-bfifo.8 @@ -6,37 +6,37 @@ bfifo \- Byte limited First In, First Out queue .SH SYNOPSIS .B tc qdisc ... add pfifo -.B [ limit +.B [ limit packets .B ] .P .B tc qdisc ... add bfifo -.B [ limit +.B [ limit bytes .B ] .SH DESCRIPTION The pfifo and bfifo qdiscs are unadorned First In, First Out queues. They are the -simplest queues possible and therefore have no overhead. +simplest queues possible and therefore have no overhead. .B pfifo -constrains the queue size as measured in packets. +constrains the queue size as measured in packets. .B bfifo does so as measured in bytes. -Like all non-default qdiscs, they maintain statistics. This might be a reason to prefer +Like all non-default qdiscs, they maintain statistics. This might be a reason to prefer pfifo or bfifo over the default. .SH ALGORITHM A list of packets is maintained, when a packet is enqueued it gets inserted at the tail of -a list. When a packet needs to be sent out to the network, it is taken from the head of the list. +a list. When a packet needs to be sent out to the network, it is taken from the head of the list. If the list is too long, no further packets are allowed on. This is called 'tail drop'. .SH PARAMETERS -.TP +.TP limit -Maximum queue size. Specified in bytes for bfifo, in packets for pfifo. For pfifo, defaults -to the interface txqueuelen, as specified with +Maximum queue size. Specified in bytes for bfifo, in packets for pfifo. 
For pfifo, defaults +to the interface txqueuelen, as specified with .BR ifconfig (8) or .BR ip (8). @@ -48,20 +48,20 @@ The range for this parameter is [0, UINT32_MAX] bytes. Note: The link layer header was considered when counting packets length. .SH OUTPUT -The output of +The output of .B tc -s qdisc ls -contains the limit, either in packets or in bytes, and the number of bytes -and packets actually sent. An unsent and dropped packet only appears between braces +contains the limit, either in packets or in bytes, and the number of bytes +and packets actually sent. An unsent and dropped packet only appears between braces and is not counted as 'Sent'. -In this example, the queue length is 100 packets, 45894 bytes were sent over 681 packets. +In this example, the queue length is 100 packets, 45894 bytes were sent over 681 packets. No packets were dropped, and as the pfifo queue does not slow down packets, there were also no overlimits: .P .nf -# tc -s qdisc ls dev eth0 +# tc -s qdisc ls dev eth0 qdisc pfifo 8001: dev eth0 limit 100p - Sent 45894 bytes 681 pkts (dropped 0, overlimits 0) + Sent 45894 bytes 681 pkts (dropped 0, overlimits 0) .fi If a backlog occurs, this is displayed as well. @@ -72,5 +72,3 @@ If a backlog occurs, this is displayed as well. Alexey N. Kuznetsov, This manpage maintained by bert hubert - - diff --git a/man/man8/tc-cbq-details.8 b/man/man8/tc-cbq-details.8 index 86353b58..9368103b 100644 --- a/man/man8/tc-cbq-details.8 +++ b/man/man8/tc-cbq-details.8 @@ -5,54 +5,54 @@ CBQ \- Class Based Queueing .B tc qdisc ... dev dev .B ( parent -classid -.B | root) [ handle -major: +classid +.B | root) [ handle +major: .B ] cbq avpkt bytes .B bandwidth rate -.B [ cell +.B [ cell bytes .B ] [ ewma log .B ] [ mpu bytes -.B ] +.B ] .B tc class ... 
dev dev -.B parent +.B parent major:[minor] -.B [ classid +.B [ classid major:minor .B ] cbq allot bytes -.B [ bandwidth -rate -.B ] [ rate +.B [ bandwidth +rate +.B ] [ rate rate .B ] prio priority .B [ weight weight -.B ] [ minburst +.B ] [ minburst packets -.B ] [ maxburst -packets -.B ] [ ewma +.B ] [ maxburst +packets +.B ] [ ewma log .B ] [ cell bytes .B ] avpkt bytes .B [ mpu -bytes +bytes .B ] [ bounded isolated ] [ split handle .B & defmap defmap -.B ] [ estimator +.B ] [ estimator interval timeconstant .B ] @@ -60,7 +60,7 @@ interval timeconstant Class Based Queueing is a classful qdisc that implements a rich linksharing hierarchy of classes. It contains shaping elements as well as prioritizing capabilities. Shaping is performed using link -idle time calculations based on the timing of dequeue events and +idle time calculations based on the timing of dequeue events and underlying link bandwidth. .SH SHAPING ALGORITHM @@ -71,10 +71,10 @@ When shaping a 10mbit/s connection to 1mbit/s, the link will be idle 90% of the time. If it isn't, it needs to be throttled so that it IS idle 90% of the time. -From the kernel's perspective, this is hard to measure, so CBQ instead -derives the idle time from the number of microseconds (in fact, jiffies) -that elapse between requests from the device driver for more data. Combined -with the knowledge of packet sizes, this is used to approximate how full or +From the kernel's perspective, this is hard to measure, so CBQ instead +derives the idle time from the number of microseconds (in fact, jiffies) +that elapse between requests from the device driver for more data. Combined +with the knowledge of packet sizes, this is used to approximate how full or empty the link is. This is rather circumspect and doesn't always arrive at proper @@ -84,9 +84,9 @@ perhaps because of a badly implemented driver? 
A PCMCIA network card will also never achieve 100mbit/s because of the way the bus is designed - again, how do we calculate the idle time? -The physical link bandwidth may be ill defined in case of not-quite-real -network devices like PPP over Ethernet or PPTP over TCP/IP. The effective -bandwidth in that case is probably determined by the efficiency of pipes +The physical link bandwidth may be ill defined in case of not-quite-real +network devices like PPP over Ethernet or PPTP over TCP/IP. The effective +bandwidth in that case is probably determined by the efficiency of pipes to userspace - which not defined. During operations, the effective idletime is measured using an @@ -104,59 +104,59 @@ CBQ throttles and is then 'overlimit'. Conversely, an idle link might amass a huge avgidle, which would then allow infinite bandwidths after a few hours of silence. To prevent -this, avgidle is capped at +this, avgidle is capped at .B maxidle. If overlimit, in theory, the CBQ could throttle itself for exactly the amount of time that was calculated to pass between packets, and then pass one packet, and throttle again. Due to timer resolution constraints, -this may not be feasible, see the +this may not be feasible, see the .B minburst parameter below. .SH CLASSIFICATION Within the one CBQ instance many classes may exist. Each of these classes -contains another qdisc, by default +contains another qdisc, by default .BR tc-pfifo (8). -When enqueueing a packet, CBQ starts at the root and uses various methods to +When enqueueing a packet, CBQ starts at the root and uses various methods to determine which class should receive the data. If a verdict is reached, this process is repeated for the recipient class which might have further means of classifying traffic to its children, if any. -CBQ has the following methods available to classify a packet to any child +CBQ has the following methods available to classify a packet to any child classes. .TP (i) .B skb->priority class encoding. 
-Can be set from userspace by an application with the +Can be set from userspace by an application with the .B SO_PRIORITY setsockopt. -The +The .B skb->priority class encoding -only applies if the skb->priority holds a major:minor handle of an existing +only applies if the skb->priority holds a major:minor handle of an existing class within this qdisc. .TP (ii) tc filters attached to the class. .TP (iii) -The defmap of a class, as set with the +The defmap of a class, as set with the .B split & defmap parameters. The defmap may contain instructions for each possible Linux packet priority. .P -Each class also has a +Each class also has a .B level. Leaf nodes, attached to the bottom of the class hierarchy, have a level of 0. .SH CLASSIFICATION ALGORITHM -Classification is a loop, which terminates when a leaf class is found. At any +Classification is a loop, which terminates when a leaf class is found. At any point the loop may jump to the fallback algorithm. The loop consists of the following steps: -.TP +.TP (i) If the packet is generated locally and has a valid classid encoded within its .B skb->priority, @@ -169,40 +169,40 @@ a class which is not a leaf class, restart loop from the class returned. If it is a leaf, choose it and terminate. .TP (iii) -If the tc filters did not return a class, but did return a classid, -try to find a class with that id within this qdisc. +If the tc filters did not return a class, but did return a classid, +try to find a class with that id within this qdisc. Check if the found class is of a lower .B level than the current class. If so, and the returned class is not a leaf node, restart the loop at the found class. If it is a leaf node, terminate. -If we found an upward reference to a higher level, enter the fallback +If we found an upward reference to a higher level, enter the fallback algorithm. 
.TP (iv) If the tc filters did not return a class, nor a valid reference to one, consider the minor number of the reference to be the priority. Retrieve a class from the defmap of this class for the priority. If this did not -contain a class, consult the defmap of this class for the +contain a class, consult the defmap of this class for the +.B BEST_EFFORT +class. If this is an upward reference, or no .B BEST_EFFORT -class. If this is an upward reference, or no -.B BEST_EFFORT class was defined, enter the fallback algorithm. If a valid class was found, and it is not a -leaf node, restart the loop at this class. If it is a leaf, choose it and +leaf node, restart the loop at this class. If it is a leaf, choose it and terminate. If -neither the priority distilled from the classid, nor the -.B BEST_EFFORT +neither the priority distilled from the classid, nor the +.B BEST_EFFORT priority yielded a class, enter the fallback algorithm. .P The fallback algorithm resides outside of the loop and is as follows. .TP (i) -Consult the defmap of the class at which the jump to fallback occurred. If -the defmap contains a class for the +Consult the defmap of the class at which the jump to fallback occurred. If +the defmap contains a class for the .B priority -of the class (which is related to the TOS field), choose this class and -terminate. +of the class (which is related to the TOS field), choose this class and +terminate. .TP (ii) Consult the map for a class for the @@ -212,28 +212,28 @@ priority. If found, choose it, and terminate. (iii) Choose the class at which break out to the fallback algorithm occurred. Terminate. .P -The packet is enqueued to the class which was chosen when either algorithm +The packet is enqueued to the class which was chosen when either algorithm terminated. It is therefore possible for a packet to be enqueued *not* at a leaf node, but in the middle of the hierarchy. 
.SH LINK SHARING ALGORITHM -When dequeuing for sending to the network device, CBQ decides which of its +When dequeuing for sending to the network device, CBQ decides which of its classes will be allowed to send. It does so with a Weighted Round Robin process in which each class with packets gets a chance to send in turn. The WRR process -starts by asking the highest priority classes (lowest numerically - +starts by asking the highest priority classes (lowest numerically - highest semantically) for packets, and will continue to do so until they -have no more data to offer, in which case the process repeats for lower +have no more data to offer, in which case the process repeats for lower priorities. .B CERTAINTY ENDS HERE, ANK PLEASE HELP Each class is not allowed to send at length though - they can only dequeue a -configurable amount of data during each round. +configurable amount of data during each round. If a class is about to go overlimit, and it is not .B bounded it will try to borrow avgidle from siblings that are not -.B isolated. +.B isolated. This process is repeated from the bottom upwards. If a class is unable to borrow enough avgidle to send a packet, it is throttled and not asked for a packet for enough time for the avgidle to increase above zero. @@ -244,7 +244,7 @@ for a packet for enough time for the avgidle to increase above zero. .SH QDISC The root qdisc of a CBQ class tree has the following parameters: -.TP +.TP parent major:minor | root This mandatory parameter determines the place of the CBQ instance, either at the .B root @@ -259,22 +259,22 @@ For calculations, the average packet size must be known. It is silently capped at a minimum of 2/3 of the interface MTU. Mandatory. .TP bandwidth rate -To determine the idle time, CBQ must know the bandwidth of your underlying +To determine the idle time, CBQ must know the bandwidth of your underlying physical interface, or parent qdisc. This is a vital parameter, more about it later. Mandatory. 
.TP cell The cell size determines he granularity of packet transmission time calculations. Has a sensible default. -.TP +.TP mpu A zero sized packet may still take time to transmit. This value is the lower cap for packet transmission time calculations - packets smaller than this value are still deemed to have this size. Defaults to zero. .TP ewma log -When CBQ needs to measure the average idle time, it does so using an +When CBQ needs to measure the average idle time, it does so using an Exponentially Weighted Moving Average which smooths out measurements into -a moving average. The EWMA LOG determines how much smoothing occurs. Defaults +a moving average. The EWMA LOG determines how much smoothing occurs. Defaults to 5. Lower values imply greater sensitivity. Must be between 0 and 31. .P A CBQ qdisc does not shape out of its own accord. It only needs to know certain @@ -283,35 +283,35 @@ parameters about the underlying link. Actual shaping is done in classes. .SH CLASSES Classes have a host of parameters to configure their operation. -.TP +.TP parent major:minor -Place of this class within the hierarchy. If attached directly to a qdisc +Place of this class within the hierarchy. If attached directly to a qdisc and not to another class, minor can be omitted. Mandatory. -.TP +.TP classid major:minor Like qdiscs, classes can be named. The major number must be equal to the -major number of the qdisc to which it belongs. Optional, but needed if this +major number of the qdisc to which it belongs. Optional, but needed if this class is going to have children. -.TP +.TP weight weight -When dequeuing to the interface, classes are tried for traffic in a +When dequeuing to the interface, classes are tried for traffic in a round-robin fashion. Classes with a higher configured qdisc will generally have more traffic to offer during each round, so it makes sense to allow it to dequeue more traffic. All weights under a class are normalized, so -only the ratios matter. 
Defaults to the configured rate, unless the priority +only the ratios matter. Defaults to the configured rate, unless the priority of this class is maximal, in which case it is set to 1. -.TP +.TP allot bytes Allot specifies how many bytes a qdisc can dequeue -during each round of the process. This parameter is weighted using the +during each round of the process. This parameter is weighted using the renormalized class weight described above. -.TP +.TP priority priority -In the round-robin process, classes with the lowest priority field are tried +In the round-robin process, classes with the lowest priority field are tried for packets first. Mandatory. -.TP +.TP rate rate Maximum rate this class and all its children combined can send at. Mandatory. @@ -321,7 +321,7 @@ This is different from the bandwidth specified when creating a CBQ disc. Only used to determine maxidle and offtime, which are only calculated when specifying maxburst or minburst. Mandatory if specifying maxburst or minburst. -.TP +.TP maxburst This number of packets is used to calculate maxidle so that when avgidle is at maxidle, this number of average packets can be burst @@ -329,7 +329,7 @@ before avgidle drops to 0. Set it higher to be more tolerant of bursts. You can't set maxidle directly, only via this parameter. .TP -minburst +minburst As mentioned before, CBQ needs to throttle in case of overlimit. The ideal solution is to do so for exactly the calculated idle time, and pass 1 packet. However, Unix kernels generally have a @@ -352,21 +352,21 @@ Minidle is specified in negative microseconds, so 10 means that avgidle is capped at -10us. .TP -bounded +bounded Signifies that this class will not borrow bandwidth from its siblings. 
-.TP +.TP isolated Means that this class will not borrow bandwidth to its siblings -.TP +.TP split major:minor & defmap bitmap[/bitmap] -If consulting filters attached to a class did not give a verdict, +If consulting filters attached to a class did not give a verdict, CBQ can also classify based on the packet's priority. There are 16 -priorities available, numbered from 0 to 15. +priorities available, numbered from 0 to 15. -The defmap specifies which priorities this class wants to receive, -specified as a bitmap. The Least Significant Bit corresponds to priority -zero. The +The defmap specifies which priorities this class wants to receive, +specified as a bitmap. The Least Significant Bit corresponds to priority +zero. The .B split parameter tells CBQ at which class the decision must be made, which should be a (grand)parent of the class you are adding. @@ -374,7 +374,7 @@ be a (grand)parent of the class you are adding. As an example, 'tc class add ... classid 10:1 cbq .. split 10:0 defmap c0' configures class 10:0 to send packets with priorities 6 and 7 to 10:1. -The complimentary configuration would then +The complimentary configuration would then be: 'tc class add ... classid 10:2 cbq ... split 10:0 defmap 3f' Which would send all packets 0, 1, 2, 3, 4 and 5 to 10:1. .TP @@ -384,11 +384,11 @@ can use to classify packets with. In order to determine the bandwidth it uses a very simple estimator that measures once every .B interval microseconds how much traffic has passed. This again is a EWMA, for which -the time constant can be specified, also in microseconds. The +the time constant can be specified, also in microseconds. The .B time constant -corresponds to the sluggishness of the measurement or, conversely, to the +corresponds to the sluggishness of the measurement or, conversely, to the sensitivity of the average to short bursts. Higher values mean less -sensitivity. +sensitivity. 
@@ -399,7 +399,7 @@ Sally Floyd and Van Jacobson, "Link-sharing and Resource Management Models for Packet Networks", IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 -.TP +.TP o Sally Floyd, "Notes on CBQ and Guarantee Service", 1995 @@ -408,7 +408,7 @@ o Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 -.TP +.TP o Sally Floyd and Michael Speer, "Experimental Results for Class-Based Queueing", 1998, not published. @@ -421,5 +421,3 @@ for Class-Based Queueing", 1998, not published. .SH AUTHOR Alexey N. Kuznetsov, . This manpage maintained by bert hubert - - diff --git a/man/man8/tc-cbq.8 b/man/man8/tc-cbq.8 index b900e1c3..301265d8 100644 --- a/man/man8/tc-cbq.8 +++ b/man/man8/tc-cbq.8 @@ -5,56 +5,56 @@ CBQ \- Class Based Queueing .B tc qdisc ... dev dev .B ( parent -classid -.B | root) [ handle -major: -.B ] cbq [ allot +classid +.B | root) [ handle +major: +.B ] cbq [ allot bytes .B ] avpkt bytes .B bandwidth rate -.B [ cell +.B [ cell bytes .B ] [ ewma log .B ] [ mpu bytes -.B ] +.B ] .B tc class ... dev dev -.B parent +.B parent major:[minor] -.B [ classid +.B [ classid major:minor .B ] cbq allot bytes -.B [ bandwidth -rate -.B ] [ rate +.B [ bandwidth +rate +.B ] [ rate rate .B ] prio priority .B [ weight weight -.B ] [ minburst +.B ] [ minburst packets -.B ] [ maxburst -packets -.B ] [ ewma +.B ] [ maxburst +packets +.B ] [ ewma log .B ] [ cell bytes .B ] avpkt bytes .B [ mpu -bytes +bytes .B ] [ bounded isolated ] [ split handle .B & defmap defmap -.B ] [ estimator +.B ] [ estimator interval timeconstant .B ] @@ -62,7 +62,7 @@ interval timeconstant Class Based Queueing is a classful qdisc that implements a rich linksharing hierarchy of classes. It contains shaping elements as well as prioritizing capabilities. Shaping is performed using link -idle time calculations based on the timing of dequeue events and +idle time calculations based on the timing of dequeue events and underlying link bandwidth. 
.SH SHAPING ALGORITHM @@ -85,71 +85,71 @@ CBQ throttles and is then 'overlimit'. Conversely, an idle link might amass a huge avgidle, which would then allow infinite bandwidths after a few hours of silence. To prevent -this, avgidle is capped at +this, avgidle is capped at .B maxidle. If overlimit, in theory, the CBQ could throttle itself for exactly the amount of time that was calculated to pass between packets, and then pass one packet, and throttle again. Due to timer resolution constraints, -this may not be feasible, see the +this may not be feasible, see the .B minburst parameter below. .SH CLASSIFICATION Within the one CBQ instance many classes may exist. Each of these classes -contains another qdisc, by default +contains another qdisc, by default .BR tc-pfifo (8). -When enqueueing a packet, CBQ starts at the root and uses various methods to -determine which class should receive the data. +When enqueueing a packet, CBQ starts at the root and uses various methods to +determine which class should receive the data. -In the absence of uncommon configuration options, the process is rather easy. -At each node we look for an instruction, and then go to the class the -instruction refers us to. If the class found is a barren leaf-node (without -children), we enqueue the packet there. If it is not yet a leaf node, we do -the whole thing over again starting from that node. +In the absence of uncommon configuration options, the process is rather easy. +At each node we look for an instruction, and then go to the class the +instruction refers us to. If the class found is a barren leaf-node (without +children), we enqueue the packet there. If it is not yet a leaf node, we do +the whole thing over again starting from that node. -The following actions are performed, in order at each node we visit, until one +The following actions are performed, in order at each node we visit, until one sends us to another node, or terminates the process. 
.TP (i) -Consult filters attached to the class. If sent to a leafnode, we are done. +Consult filters attached to the class. If sent to a leafnode, we are done. Otherwise, restart. .TP (ii) -Consult the defmap for the priority assigned to this packet, which depends +Consult the defmap for the priority assigned to this packet, which depends on the TOS bits. Check if the referral is leafless, otherwise restart. .TP (iii) -Ask the defmap for instructions for the 'best effort' priority. Check the +Ask the defmap for instructions for the 'best effort' priority. Check the answer for leafness, otherwise restart. .TP (iv) If none of the above returned with an instruction, enqueue at this node. .P This algorithm makes sure that a packet always ends up somewhere, even while -you are busy building your configuration. +you are busy building your configuration. For more details, see .BR tc-cbq-details(8). .SH LINK SHARING ALGORITHM -When dequeuing for sending to the network device, CBQ decides which of its +When dequeuing for sending to the network device, CBQ decides which of its classes will be allowed to send. It does so with a Weighted Round Robin process in which each class with packets gets a chance to send in turn. The WRR process -starts by asking the highest priority classes (lowest numerically - +starts by asking the highest priority classes (lowest numerically - highest semantically) for packets, and will continue to do so until they -have no more data to offer, in which case the process repeats for lower +have no more data to offer, in which case the process repeats for lower priorities. -Classes by default borrow bandwidth from their siblings. A class can be -prevented from doing so by declaring it 'bounded'. A class can also indicate +Classes by default borrow bandwidth from their siblings. A class can be +prevented from doing so by declaring it 'bounded'. A class can also indicate its unwillingness to lend out bandwidth by being 'isolated'. 
.SH QDISC The root of a CBQ qdisc class tree has the following parameters: -.TP +.TP parent major:minor | root This mandatory parameter determines the place of the CBQ instance, either at the .B root @@ -159,7 +159,7 @@ handle major: Like all other qdiscs, the CBQ can be assigned a handle. Should consist only of a major number, followed by a colon. Optional, but very useful if classes will be generated within this qdisc. -.TP +.TP allot bytes This allotment is the 'chunkiness' of link sharing and is used for determining packet transmission time tables. The qdisc allot differs slightly from the class allot discussed @@ -170,23 +170,23 @@ The average size of a packet is needed for calculating maxidle, and is also used for making sure 'allot' has a safe value. Mandatory. .TP bandwidth rate -To determine the idle time, CBQ must know the bandwidth of your underlying +To determine the idle time, CBQ must know the bandwidth of your underlying physical interface, or parent qdisc. This is a vital parameter, more about it later. Mandatory. .TP cell The cell size determines he granularity of packet transmission time calculations. Has a sensible default. -.TP +.TP mpu A zero sized packet may still take time to transmit. This value is the lower cap for packet transmission time calculations - packets smaller than this value are still deemed to have this size. Defaults to zero. .TP ewma log -When CBQ needs to measure the average idle time, it does so using an +When CBQ needs to measure the average idle time, it does so using an Exponentially Weighted Moving Average which smooths out measurements into -a moving average. The EWMA LOG determines how much smoothing occurs. Lower -values imply greater sensitivity. Must be between 0 and 31. Defaults +a moving average. The EWMA LOG determines how much smoothing occurs. Lower +values imply greater sensitivity. Must be between 0 and 31. Defaults to 5. .P A CBQ qdisc does not shape out of its own accord. 
It only needs to know certain @@ -195,40 +195,40 @@ parameters about the underlying link. Actual shaping is done in classes. .SH CLASSES Classes have a host of parameters to configure their operation. -.TP +.TP parent major:minor -Place of this class within the hierarchy. If attached directly to a qdisc +Place of this class within the hierarchy. If attached directly to a qdisc and not to another class, minor can be omitted. Mandatory. -.TP +.TP classid major:minor Like qdiscs, classes can be named. The major number must be equal to the -major number of the qdisc to which it belongs. Optional, but needed if this +major number of the qdisc to which it belongs. Optional, but needed if this class is going to have children. -.TP +.TP weight weight -When dequeuing to the interface, classes are tried for traffic in a +When dequeuing to the interface, classes are tried for traffic in a round-robin fashion. Classes with a higher configured qdisc will generally have more traffic to offer during each round, so it makes sense to allow it to dequeue more traffic. All weights under a class are normalized, so -only the ratios matter. Defaults to the configured rate, unless the priority +only the ratios matter. Defaults to the configured rate, unless the priority of this class is maximal, in which case it is set to 1. -.TP +.TP allot bytes Allot specifies how many bytes a qdisc can dequeue -during each round of the process. This parameter is weighted using the +during each round of the process. This parameter is weighted using the renormalized class weight described above. Silently capped at a minimum of 3/2 avpkt. Mandatory. -.TP +.TP prio priority -In the round-robin process, classes with the lowest priority field are tried +In the round-robin process, classes with the lowest priority field are tried for packets first. Mandatory. -.TP +.TP avpkt See the QDISC section. -.TP +.TP rate rate Maximum rate this class and all its children combined can send at. Mandatory. 
@@ -238,7 +238,7 @@ This is different from the bandwidth specified when creating a CBQ disc! Only used to determine maxidle and offtime, which are only calculated when specifying maxburst or minburst. Mandatory if specifying maxburst or minburst. -.TP +.TP maxburst This number of packets is used to calculate maxidle so that when avgidle is at maxidle, this number of average packets can be burst @@ -246,7 +246,7 @@ before avgidle drops to 0. Set it higher to be more tolerant of bursts. You can't set maxidle directly, only via this parameter. .TP -minburst +minburst As mentioned before, CBQ needs to throttle in case of overlimit. The ideal solution is to do so for exactly the calculated idle time, and pass 1 packet. However, Unix kernels generally have a @@ -269,21 +269,21 @@ Minidle is specified in negative microseconds, so 10 means that avgidle is capped at -10us. Optional. .TP -bounded +bounded Signifies that this class will not borrow bandwidth from its siblings. -.TP +.TP isolated Means that this class will not borrow bandwidth to its siblings -.TP +.TP split major:minor & defmap bitmap[/bitmap] -If consulting filters attached to a class did not give a verdict, +If consulting filters attached to a class did not give a verdict, CBQ can also classify based on the packet's priority. There are 16 -priorities available, numbered from 0 to 15. +priorities available, numbered from 0 to 15. -The defmap specifies which priorities this class wants to receive, -specified as a bitmap. The Least Significant Bit corresponds to priority -zero. The +The defmap specifies which priorities this class wants to receive, +specified as a bitmap. The Least Significant Bit corresponds to priority +zero. The .B split parameter tells CBQ at which class the decision must be made, which should be a (grand)parent of the class you are adding. @@ -291,7 +291,7 @@ be a (grand)parent of the class you are adding. As an example, 'tc class add ... classid 10:1 cbq .. 
split 10:0 defmap c0' configures class 10:0 to send packets with priorities 6 and 7 to 10:1. -The complimentary configuration would then +The complimentary configuration would then be: 'tc class add ... classid 10:2 cbq ... split 10:0 defmap 3f' Which would send all packets 0, 1, 2, 3, 4 and 5 to 10:1. .TP @@ -301,22 +301,22 @@ can use to classify packets with. In order to determine the bandwidth it uses a very simple estimator that measures once every .B interval microseconds how much traffic has passed. This again is a EWMA, for which -the time constant can be specified, also in microseconds. The +the time constant can be specified, also in microseconds. The .B time constant -corresponds to the sluggishness of the measurement or, conversely, to the +corresponds to the sluggishness of the measurement or, conversely, to the sensitivity of the average to short bursts. Higher values mean less -sensitivity. +sensitivity. .SH BUGS -The actual bandwidth of the underlying link may not be known, for example -in the case of PPoE or PPTP connections which in fact may send over a +The actual bandwidth of the underlying link may not be known, for example +in the case of PPoE or PPTP connections which in fact may send over a pipe, instead of over a physical device. CBQ is quite resilient to major errors in the configured bandwidth, probably a the cost of coarser shaping. -Default kernels rely on coarse timing information for making decisions. These +Default kernels rely on coarse timing information for making decisions. These may make shaping precise in the long term, but inaccurate on second long scales. -See +See .BR tc-cbq-details(8) for hints on how to improve this. 
@@ -327,7 +327,7 @@ Sally Floyd and Van Jacobson, "Link-sharing and Resource Management Models for Packet Networks", IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 -.TP +.TP o Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 @@ -336,7 +336,7 @@ o Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 -.TP +.TP o Sally Floyd and Michael Speer, "Experimental Results for Class-Based Queueing", 1998, not published. @@ -349,5 +349,3 @@ for Class-Based Queueing", 1998, not published. .SH AUTHOR Alexey N. Kuznetsov, . This manpage maintained by bert hubert - - diff --git a/man/man8/tc-drr.8 b/man/man8/tc-drr.8 index f550a35d..2fea4ee2 100644 --- a/man/man8/tc-drr.8 +++ b/man/man8/tc-drr.8 @@ -92,4 +92,3 @@ as limits are handled by the individual child qdiscs. .SH AUTHOR sched_drr was written by Patrick McHardy. - diff --git a/man/man8/tc-htb.8 b/man/man8/tc-htb.8 index 95f25dea..ae310f43 100644 --- a/man/man8/tc-htb.8 +++ b/man/man8/tc-htb.8 @@ -5,30 +5,30 @@ HTB \- Hierarchy Token Bucket .B tc qdisc ... dev dev .B ( parent -classid -.B | root) [ handle -major: -.B ] htb [ default +classid +.B | root) [ handle +major: +.B ] htb [ default minor-id -.B ] +.B ] .B tc class ... dev dev -.B parent +.B parent major:[minor] -.B [ classid +.B [ classid major:minor .B ] htb rate rate .B [ ceil -rate -.B ] burst +rate +.B ] burst bytes .B [ cburst bytes .B ] [ prio priority -.B ] +.B ] .SH DESCRIPTION HTB is meant as a more understandable and intuitive replacement for @@ -37,9 +37,9 @@ of the outbound bandwidth on a given link. Both allow you to use one physical link to simulate several slower links and to send different kinds of traffic on different simulated links. In both cases, you have to specify how to divide the physical link into simulated links and -how to decide which simulated link to use for a given packet to be sent. +how to decide which simulated link to use for a given packet to be sent. 
-Unlike CBQ, HTB shapes traffic based on the Token Bucket Filter algorithm +Unlike CBQ, HTB shapes traffic based on the Token Bucket Filter algorithm which does not depend on interface characteristics and so does not need to know the underlying bandwidth of the outgoing interface. @@ -49,30 +49,30 @@ Shaping works as documented in .SH CLASSIFICATION Within the one HTB instance many classes may exist. Each of these classes -contains another qdisc, by default +contains another qdisc, by default .BR tc-pfifo (8). -When enqueueing a packet, HTB starts at the root and uses various methods to -determine which class should receive the data. +When enqueueing a packet, HTB starts at the root and uses various methods to +determine which class should receive the data. -In the absence of uncommon configuration options, the process is rather easy. -At each node we look for an instruction, and then go to the class the -instruction refers us to. If the class found is a barren leaf-node (without -children), we enqueue the packet there. If it is not yet a leaf node, we do -the whole thing over again starting from that node. +In the absence of uncommon configuration options, the process is rather easy. +At each node we look for an instruction, and then go to the class the +instruction refers us to. If the class found is a barren leaf-node (without +children), we enqueue the packet there. If it is not yet a leaf node, we do +the whole thing over again starting from that node. -The following actions are performed, in order at each node we visit, until one +The following actions are performed, in order at each node we visit, until one sends us to another node, or terminates the process. .TP (i) -Consult filters attached to the class. If sent to a leafnode, we are done. +Consult filters attached to the class. If sent to a leafnode, we are done. Otherwise, restart. .TP (ii) If none of the above returned with an instruction, enqueue at this node. 
.P This algorithm makes sure that a packet always ends up somewhere, even while -you are busy building your configuration. +you are busy building your configuration. .SH LINK SHARING ALGORITHM FIXME @@ -80,7 +80,7 @@ FIXME .SH QDISC The root of a HTB qdisc class tree has the following parameters: -.TP +.TP parent major:minor | root This mandatory parameter determines the place of the HTB instance, either at the .B root @@ -90,54 +90,54 @@ handle major: Like all other qdiscs, the HTB can be assigned a handle. Should consist only of a major number, followed by a colon. Optional, but very useful if classes will be generated within this qdisc. -.TP +.TP default minor-id Unclassified traffic gets sent to the class with this minor-id. .SH CLASSES Classes have a host of parameters to configure their operation. -.TP +.TP parent major:minor -Place of this class within the hierarchy. If attached directly to a qdisc +Place of this class within the hierarchy. If attached directly to a qdisc and not to another class, minor can be omitted. Mandatory. -.TP +.TP classid major:minor Like qdiscs, classes can be named. The major number must be equal to the -major number of the qdisc to which it belongs. Optional, but needed if this +major number of the qdisc to which it belongs. Optional, but needed if this class is going to have children. -.TP +.TP prio priority -In the round-robin process, classes with the lowest priority field are tried +In the round-robin process, classes with the lowest priority field are tried for packets first. Mandatory. -.TP +.TP rate rate Maximum rate this class and all its children are guaranteed. Mandatory. .TP ceil rate -Maximum rate at which a class can send, if its parent has bandwidth to spare. +Maximum rate at which a class can send, if its parent has bandwidth to spare. 
Defaults to the configured rate, which implies no borrowing -.TP +.TP burst bytes -Amount of bytes that can be burst at +Amount of bytes that can be burst at .B ceil speed, in excess of the configured -.B rate. +.B rate. Should be at least as high as the highest burst of all children. -.TP +.TP cburst bytes Amount of bytes that can be burst at 'infinite' speed, in other words, as fast as the interface can transmit them. For perfect evening out, should be equal to at most one average packet. Should be at least as high as the highest cburst of all children. .SH NOTES -Due to Unix timing constraints, the maximum ceil rate is not infinite and may in fact be quite low. On Intel, +Due to Unix timing constraints, the maximum ceil rate is not infinite and may in fact be quite low. On Intel, there are 100 timer events per second, the maximum rate is that rate at which 'burst' bytes are sent each timer tick. -From this, the minimum burst size for a specified rate can be calculated. For i386, a 10mbit rate requires a 12 kilobyte +From this, the minimum burst size for a specified rate can be calculated. For i386, a 10mbit rate requires a 12 kilobyte burst as 100*12kb*8 equals 10mbit. .SH SEE ALSO @@ -146,5 +146,3 @@ burst as 100*12kb*8 equals 10mbit. HTB website: http://luxik.cdi.cz/~devik/qos/htb/ .SH AUTHOR Martin Devera . This manpage maintained by bert hubert - - diff --git a/man/man8/tc-netem.8 b/man/man8/tc-netem.8 index 53c4de97..b31384f5 100644 --- a/man/man8/tc-netem.8 +++ b/man/man8/tc-netem.8 @@ -2,9 +2,9 @@ .SH NAME NetEm \- Network Emulator .SH SYNOPSIS -.B "tc qdisc ... dev" +.B "tc qdisc ... 
dev" .IR DEVICE " ] " -.BR "add netem" +.BR "add netem" .I OPTIONS .IR OPTIONS " := [ " LIMIT " ] [ " DELAY " ] [ " LOSS \ @@ -15,15 +15,15 @@ NetEm \- Network Emulator .I packets .IR DELAY " := " -.BI delay +.BI delay .IR TIME " [ " JITTER " [ " CORRELATION " ]]]" .br - [ + [ .BR distribution " { "uniform " | " normal " | " pareto " | " paretonormal " } ]" .IR LOSS " := " .BR loss " { " -.BI random +.BI random .IR PERCENT " [ " CORRELATION " ] |" .br .RB " " state @@ -44,13 +44,13 @@ NetEm \- Network Emulator .IR REORDERING " := " .B reorder .IR PERCENT " [ " CORRELATION " ] [ " -.B gap +.B gap .IR DISTANCE " ]" .IR RATE " := " .B rate .IR RATE " [ " PACKETOVERHEAD " [ " CELLSIZE " [ " CELLOVERHEAD " ]]]]" - + .SH DESCRIPTION NetEm is an enhancement of the Linux traffic control facilities @@ -139,11 +139,11 @@ in this second example 25% of packets are sent immediately (with correlation of 50%) while the others are delayed by 10 ms. .SS rate -delay packets based on packet size and is a replacement for +delay packets based on packet size and is a replacement for .IR TBF . Rate can be -specified in common units (e.g. 100kbit). Optional -.I PACKETOVERHEAD +specified in common units (e.g. 100kbit). Optional +.I PACKETOVERHEAD (in bytes) specify an per packet overhead and can be negative. A positive value can be used to simulate additional link layer headers. A negative value can be used to artificial strip the Ethernet header (e.g. -14) and/or simulate a link layer @@ -152,7 +152,7 @@ the cellsize. Cellsize can be used to simulate link layer schemes. ATM for example has an payload cellsize of 48 bytes and 5 byte per cell header. If a packet is 50 byte then ATM must use two cells: 2 * 48 bytes payload including 2 * 5 byte header, thus consume 106 byte on the wire. The last optional value -.I CELLOVERHEAD +.I CELLOVERHEAD can be used to specify per cell overhead - for our ATM example 5. .I CELLOVERHEAD can be negative, but use negative values with caution. 
diff --git a/man/man8/tc-pfifo_fast.8 b/man/man8/tc-pfifo_fast.8 index 43ab166e..baf34b1d 100644 --- a/man/man8/tc-pfifo_fast.8 +++ b/man/man8/tc-pfifo_fast.8 @@ -13,14 +13,14 @@ is detached. In this sense this qdisc is magic, and unlike other qdiscs. .SH ALGORITHM -The algorithm is very similar to that of the classful +The algorithm is very similar to that of the classful .BR tc-prio (8) -qdisc. +qdisc. .B pfifo_fast is like three .BR tc-pfifo (8) queues side by side, where packets can be enqueued in any of the three bands -based on their Type of Service bits or assigned priority. +based on their Type of Service bits or assigned priority. Not all three bands are dequeued simultaneously - as long as lower bands have traffic, higher bands are never dequeued. This can be used to @@ -28,7 +28,7 @@ prioritize interactive traffic or penalize 'lowest cost' traffic. Each band can be txqueuelen packets long, as configured with .BR ifconfig (8) -or +or .BR ip (8). Additional packets coming in are not enqueued but are instead dropped. @@ -36,7 +36,7 @@ See .BR tc-prio (8) for complete details on how TOS bits are translated into bands. .SH PARAMETERS -.TP +.TP txqueuelen The length of the three bands depends on the interface txqueuelen, as specified with @@ -46,7 +46,7 @@ or .SH BUGS Does not maintain statistics and does not show up in tc qdisc ls. This is because -it is the automatic default in the absence of a configured qdisc. +it is the automatic default in the absence of a configured qdisc. .SH SEE ALSO .BR tc (8) @@ -55,5 +55,3 @@ it is the automatic default in the absence of a configured qdisc. Alexey N. Kuznetsov, This manpage maintained by bert hubert - - diff --git a/man/man8/tc-prio.8 b/man/man8/tc-prio.8 index 99a4a261..605f3d39 100644 --- a/man/man8/tc-prio.8 +++ b/man/man8/tc-prio.8 @@ -5,21 +5,21 @@ PRIO \- Priority qdisc .B tc qdisc ... 
dev dev .B ( parent -classid -.B | root) [ handle -major: -.B ] prio [ bands +classid +.B | root) [ handle +major: +.B ] prio [ bands bands .B ] [ priomap band band band... -.B ] [ estimator +.B ] [ estimator interval timeconstant .B ] .SH DESCRIPTION The PRIO qdisc is a simple classful queueing discipline that contains an arbitrary number of classes of differing priority. The classes are -dequeued in numerical descending order of priority. PRIO is a scheduler +dequeued in numerical descending order of priority. PRIO is a scheduler and never delays packets - it is a work-conserving qdisc, though the qdiscs contained in the classes may not be. @@ -51,22 +51,22 @@ From userspace A process with sufficient privileges can encode the destination class directly with SO_PRIORITY, see .BR socket(7). -.TP +.TP with a tc filter A tc filter attached to the root qdisc can point traffic directly to a class -.TP +.TP with the priomap Based on the packet priority, which in turn is derived from the Type of Service assigned to the packet. .P -Only the priomap is specific to this qdisc. +Only the priomap is specific to this qdisc. .SH QDISC PARAMETERS .TP bands Number of bands. If changed from the default of 3, .B priomap must be updated as well. -.TP +.TP priomap The priomap maps the priority of a packet to a class. The priority can either be set directly from userspace, @@ -126,7 +126,7 @@ TOS Bits Means Linux Priority Band The second column contains the value of the relevant four TOS bits, followed by their translated meaning. For example, 15 stands for a packet wanting Minimal Monetary Cost, Maximum Reliability, Maximum -Throughput AND Minimum Delay. +Throughput AND Minimum Delay. The fourth column lists the way the Linux kernel interprets the TOS bits, by showing to which Priority they are mapped. @@ -151,7 +151,7 @@ FTP TFTP 1000 (minimize delay) -SMTP +SMTP Command phase 1000 (minimize delay) DATA phase 0100 (maximize throughput) @@ -176,12 +176,10 @@ further qdisc. 
.SH BUGS Large amounts of traffic in the lower bands can cause starvation of higher -bands. Can be prevented by attaching a shaper (for example, +bands. Can be prevented by attaching a shaper (for example, .BR tc-tbf(8) to these bands to make sure they cannot dominate the link. .SH AUTHORS Alexey N. Kuznetsov, , J Hadi Salim . This manpage maintained by bert hubert - - diff --git a/man/man8/tc-red.8 b/man/man8/tc-red.8 index d001c498..dd1ab74c 100644 --- a/man/man8/tc-red.8 +++ b/man/man8/tc-red.8 @@ -1,17 +1,17 @@ .TH RED 8 "13 December 2001" "iproute2" "Linux" .SH NAME -red \- Random Early Detection +red \- Random Early Detection .SH SYNOPSIS .B tc qdisc ... red -.B limit +.B limit +bytes +.B [ min +bytes +.B ] [ max bytes -.B [ min -bytes -.B ] [ max -bytes .B ] avpkt bytes -.B [ burst +.B [ burst packets .B ] [ ecn ] [ harddrop] [ bandwidth rate @@ -46,51 +46,51 @@ The average queue size is used for determining the marking probability. This is calculated using an Exponential Weighted Moving Average, which can be more or less sensitive to bursts. -When the average queue size is below +When the average queue size is below .B min -bytes, no packet will ever be marked. When it exceeds -.B min, +bytes, no packet will ever be marked. When it exceeds +.B min, the probability of doing so climbs linearly up -to -.B probability, +to +.B probability, until the average queue size hits .B max -bytes. Because -.B probability +bytes. Because +.B probability is normally not set to 100%, the queue size might -conceivably rise above +conceivably rise above .B max -bytes, so the +bytes, so the .B limit parameter is provided to set a hard maximum for the size of the queue. .SH PARAMETERS -.TP +.TP min Average queue size at which marking becomes a possibility. Defaults to .B max /3 -.TP +.TP max At this average queue size, the marking probability is maximal. 
Should be at least twice .B min -to prevent synchronous retransmits, higher for low +to prevent synchronous retransmits, higher for low .B min. -Default to +Default to .B limit /4 -.TP +.TP probability Maximum probability for marking, specified as a floating point number from 0.0 to 1.0. Suggested values are 0.01 or 0.02 (1 or 2%, respectively). Default : 0.02 -.TP +.TP limit Hard limit on the real (not average) queue size in bytes. Further packets are dropped. Should be set higher than max+burst. It is advised to set this -a few times higher than +a few times higher than .B max. .TP burst @@ -98,7 +98,7 @@ Used for determining how fast the average queue size is influenced by the real queue size. Larger values make the calculation more sluggish, allowing longer bursts of traffic before marking starts. Real life experiments support the following guideline: (min+min+max)/(3*avpkt). -.TP +.TP avpkt Specified in bytes. Used with burst to determine the time constant for average queue size calculations. 1000 is a good value. @@ -126,15 +126,15 @@ bytes, this parameter forces a drop instead of ecn marking. adaptive (Added in linux-3.3) Sets RED in adaptive mode as described in http://icir.org/floyd/papers/adaptiveRed.pdf .nf -Goal of Adaptive RED is to make 'probability' dynamic value between 1% and 50% to reach the target average queue : +Goal of Adaptive RED is to make 'probability' dynamic value between 1% and 50% to reach the target average queue : .B (max - min) / 2 .fi .SH EXAMPLE .P -# tc qdisc add dev eth0 parent 1:1 handle 10: red - limit 400000 min 30000 max 90000 avpkt 1000 +# tc qdisc add dev eth0 parent 1:1 handle 10: red + limit 400000 min 30000 max 90000 avpkt 1000 burst 55 ecn adaptive bandwidth 10Mbit .SH SEE ALSO @@ -142,11 +142,11 @@ Goal of Adaptive RED is to make 'probability' dynamic value between 1% and 50% t .BR tc-choke (8) .SH SOURCES -.TP +.TP o Floyd, S., and Jacobson, V., Random Early Detection gateways for Congestion Avoidance. 
http://www.aciri.org/floyd/papers/red/red.html -.TP +.TP o Some changes to the algorithm by Alexey N. Kuznetsov. .TP @@ -156,7 +156,5 @@ Adaptive RED : http://icir.org/floyd/papers/adaptiveRed.pdf .SH AUTHORS Alexey N. Kuznetsov, , Alexey Makarenko , J Hadi Salim , -Eric Dumazet . +Eric Dumazet . This manpage maintained by bert hubert - - diff --git a/man/man8/tc-sfq.8 b/man/man8/tc-sfq.8 index 9afb5b24..ec4d8b8d 100644 --- a/man/man8/tc-sfq.8 +++ b/man/man8/tc-sfq.8 @@ -33,11 +33,11 @@ P .SH DESCRIPTION Stochastic Fairness Queueing is a classless queueing discipline available for -traffic control with the +traffic control with the .BR tc (8) command. -SFQ does not shape traffic but only schedules the transmission of packets, based on 'flows'. +SFQ does not shape traffic but only schedules the transmission of packets, based on 'flows'. The goal is to ensure fairness so that each flow is able to send data in turn, thus preventing any single flow from drowning out the rest. @@ -62,13 +62,13 @@ Destination address (iii) Source and Destination port .P -If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP. -Packets with other protocols are hashed based on the 32bits representation of their +If these are available. SFQ knows about ipv4 and ipv6 and also UDP, TCP and ESP. +Packets with other protocols are hashed based on the 32bits representation of their destination and source. A flow corresponds mostly to a TCP/IP connection. Each of these buckets should represent a unique flow. Because multiple flows may -get hashed to the same bucket, sfqs internal hashing algorithm may be perturbed at configurable -intervals so that the unfairness lasts only for a short while. Perturbation may +get hashed to the same bucket, sfqs internal hashing algorithm may be perturbed at configurable +intervals so that the unfairness lasts only for a short while. Perturbation may however cause some inadvertent packet reordering to occur. 
After linux-3.3, there is no packet reordering problem, but possible packet drops if rehashing hits one limit (number of flows or packets per flow) @@ -88,7 +88,7 @@ divisor Can be used to set a different hash table size, available from kernel 2.6.39 onwards. The specified divisor must be a power of two and cannot be larger than 65536. Default value: 1024. -.TP +.TP limit Upper limit of the SFQ. Can be used to reduce the default length of 127 packets. After linux-3.3, it can be raised. @@ -97,12 +97,12 @@ depth Limit of packets per flow (after linux-3.3). Default to 127 and can be lowered. .TP perturb -Interval in seconds for queue algorithm perturbation. Defaults to 0, which means that +Interval in seconds for queue algorithm perturbation. Defaults to 0, which means that no perturbation occurs. Do not set too low for each perturbation may cause some packet reordering or losses. Advised value: 60 This value has no effect when external flow classification is used. Its better to increase divisor value to lower risk of hash collisions. -.TP +.TP quantum Amount of bytes a flow is allowed to dequeue during a round of the round robin process. Defaults to the MTU of the interface which is also the advised value and the minimum value. @@ -142,7 +142,7 @@ Specified in bytes. Used with burst to determine the time constant for average q burst Used for determining how fast the average queue size is influenced by the real queue size. .nf -Default value is : +Default value is : .B (2 * min + max) / (3 * avpkt) .fi .TP @@ -166,16 +166,16 @@ To attach to device ppp0: .P # tc qdisc add dev ppp0 root sfq .P -Please note that SFQ, like all non-shaping (work-conserving) qdiscs, is only useful +Please note that SFQ, like all non-shaping (work-conserving) qdiscs, is only useful if it owns the queue. -This is the case when the link speed equals the actually available bandwidth. This holds -for regular phone modems, ISDN connections and direct non-switched ethernet links. 
+This is the case when the link speed equals the actually available bandwidth. This holds +for regular phone modems, ISDN connections and direct non-switched ethernet links. .P -Most often, cable modems and DSL devices do not fall into this category. The same holds -for when connected to a switch and trying to send data to a congested segment also +Most often, cable modems and DSL devices do not fall into this category. The same holds +for when connected to a switch and trying to send data to a congested segment also connected to the switch. .P -In this case, the effective queue does not reside within Linux and is therefore not +In this case, the effective queue does not reside within Linux and is therefore not available for scheduling. .P Embed SFQ in a classful qdisc to make sure it owns the queue. @@ -191,11 +191,11 @@ changed the sfq default of 1024, use the same value for the flow hash filter, to .P Example of sfq with optional RED mode : .P -# tc qdisc add dev eth0 parent 1:1 handle 10: sfq limit 3000 flows 512 divisor 16384 +# tc qdisc add dev eth0 parent 1:1 handle 10: sfq limit 3000 flows 512 divisor 16384 redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn headdrop .SH SOURCE -.TP +.TP o Paul E. McKenney "Stochastic Fairness Queuing", IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. @@ -205,7 +205,7 @@ o Paul E. McKenney "Stochastic Fairness Queuing", "Interworking: Research and Experience", v.2, 1991, p.113-131. -.TP +.TP o See also: M. Shreedhar and George Varghese "Efficient Fair @@ -220,5 +220,3 @@ Alexey N. Kuznetsov, , Eric Dumazet . .P This manpage maintained by bert hubert - - diff --git a/man/man8/tc-tbf.8 b/man/man8/tc-tbf.8 index fc2c8372..d721b5d9 100644 --- a/man/man8/tc-tbf.8 +++ b/man/man8/tc-tbf.8 @@ -6,11 +6,11 @@ tbf \- Token Bucket Filter rate .B burst bytes/cell -.B ( latency -ms +.B ( latency +ms .B | limit bytes -.B ) [ mpu +.B ) [ mpu bytes .B [ peakrate rate @@ -22,46 +22,46 @@ burst is also known as buffer and maxburst. 
mtu is also known as minburst. .SH DESCRIPTION The Token Bucket Filter is a classful queueing discipline available for -traffic control with the +traffic control with the .BR tc (8) command. TBF is a pure shaper and never schedules traffic. It is non-work-conserving and may throttle -itself, although packets are available, to ensure that the configured rate is not exceeded. -It is able to shape up to 1mbit/s of normal traffic with ideal minimal burstiness, +itself, although packets are available, to ensure that the configured rate is not exceeded. +It is able to shape up to 1mbit/s of normal traffic with ideal minimal burstiness, sending out data exactly at the configured rates. Much higher rates are possible but at the cost of losing the minimal burstiness. In that -case, data is on average dequeued at the configured rate but may be sent much faster at millisecond +case, data is on average dequeued at the configured rate but may be sent much faster at millisecond timescales. Because of further queues living in network adaptors, this is often not a problem. .SH ALGORITHM -As the name implies, traffic is filtered based on the expenditure of +As the name implies, traffic is filtered based on the expenditure of .B tokens. Tokens roughly correspond to bytes, with the additional constraint that each packet consumes some tokens, no matter how small it is. This reflects the fact that even a zero-sized packet occupies the link for some time. -On creation, the TBF is stocked with tokens which correspond to the amount of traffic that can be burst +On creation, the TBF is stocked with tokens which correspond to the amount of traffic that can be burst in one go. Tokens arrive at a steady rate, until the bucket is full. -If no tokens are available, packets are queued, up to a configured limit. The TBF now +If no tokens are available, packets are queued, up to a configured limit. 
The TBF now calculates the token deficit, and throttles until the first packet in the queue can be sent. -If it is not acceptable to burst out packets at maximum speed, a peakrate can be configured +If it is not acceptable to burst out packets at maximum speed, a peakrate can be configured to limit the speed at which the bucket empties. This peakrate is implemented as a second TBF with a very small bucket, so that it doesn't burst. -To achieve perfection, the second bucket may contain only a single packet, which leads to -the earlier mentioned 1mbit/s limit. +To achieve perfection, the second bucket may contain only a single packet, which leads to +the earlier mentioned 1mbit/s limit. This limit is caused by the fact that the kernel can only throttle for at minimum 1 'jiffy', which depends -on HZ as 1/HZ. For perfect shaping, only a single packet can get sent per jiffy - for HZ=100, this means 100 +on HZ as 1/HZ. For perfect shaping, only a single packet can get sent per jiffy - for HZ=100, this means 100 packets of on average 1000 bytes each, which roughly corresponds to 1mbit/s. .SH PARAMETERS -See +See .BR tc (8) for how to specify the units of these values. .TP @@ -71,30 +71,30 @@ available. You can also specify this the other way around by setting the latency parameter, which specifies the maximum amount of time a packet can sit in the TBF. The latter calculation takes into account the size of the bucket, the rate and possibly the peakrate (if set). These two parameters -are mutually exclusive. +are mutually exclusive. .TP burst Also known as buffer or maxburst. -Size of the bucket, in bytes. This is the maximum amount of bytes that tokens can be available for instantaneously. -In general, larger shaping rates require a larger buffer. For 10mbit/s on Intel, you need at least 10kbyte buffer +Size of the bucket, in bytes. This is the maximum amount of bytes that tokens can be available for instantaneously. 
+In general, larger shaping rates require a larger buffer. For 10mbit/s on Intel, you need at least 10kbyte buffer if you want to reach your configured rate! If your buffer is too small, packets may be dropped because more tokens arrive per timer tick than fit in your bucket. The minimum buffer size can be calculated by dividing the rate by HZ. -Token usage calculations are performed using a table which by default has a resolution of 8 packets. -This resolution can be changed by specifying the +Token usage calculations are performed using a table which by default has a resolution of 8 packets. +This resolution can be changed by specifying the .B cell size with the burst. For example, to specify a 6000 byte buffer with a 16 byte cell size, set a burst of 6000/16. You will probably never have to set this. Must be an integral power of 2. .TP mpu -A zero-sized packet does not use zero bandwidth. For ethernet, no packet uses less than 64 bytes. The Minimum Packet Unit +A zero-sized packet does not use zero bandwidth. For ethernet, no packet uses less than 64 bytes. The Minimum Packet Unit determines the minimal token usage (specified in bytes) for a packet. Defaults to zero. .TP rate -The speed knob. See remarks above about limits! See +The speed knob. See remarks above about limits! See .BR tc (8) for units. .PP @@ -112,7 +112,7 @@ Specifies the size of the peakrate bucket. For perfect accuracy, should be set t If a peakrate is needed, but some burstiness is acceptable, this size can be raised. A 3000 byte minburst allows around 3mbit/s of peakrate, given 1000 byte packets. -Like the regular burstsize you can also specify a +Like the regular burstsize you can also specify a .B cell size. .SH EXAMPLE & USAGE @@ -139,5 +139,3 @@ the limit/latency is not effective anymore. .SH AUTHOR Alexey N. Kuznetsov, . 
This manpage maintained by bert hubert - - diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 7a1090b9..6275c4b3 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -732,4 +732,3 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .SH AUTHOR Manpage maintained by bert hubert (ahu@ds9a.nl) - From 23d6c997d9e199c83cd57fce343ba7d6ad2c587b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Nov 2015 15:42:34 -0800 Subject: [PATCH 070/151] misc: remove extra blank line --- misc/ssfilter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/misc/ssfilter.h b/misc/ssfilter.h index b20092bc..53922a84 100644 --- a/misc/ssfilter.h +++ b/misc/ssfilter.h @@ -20,4 +20,3 @@ struct ssfilter int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp); void *parse_hostcond(char *addr, bool is_port); - From f7b49a3fc7701c2b0cd11e30a64f9b9fea766c2c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 6 Nov 2015 18:54:08 +0100 Subject: [PATCH 071/151] ip_common.h header cleanup - Drop 'extern' keyword from all function prototypes. - Make line breaking of print_* functions consistent. - Make print_ntable() and ipntable_reset_filter() static and remove their declaration. - Drop declaration of non-existent ipaddr_list() and iproute_monitor(). 
Signed-off-by: Phil Sutter --- ip/ip_common.h | 120 +++++++++++++++++++++++-------------------------- ip/ipntable.c | 4 +- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/ip/ip_common.h b/ip/ip_common.h index f74face6..9a846df3 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -1,68 +1,60 @@ -extern int get_operstate(const char *name); -extern int print_linkinfo(const struct sockaddr_nl *who, - struct nlmsghdr *n, - void *arg); -extern int print_linkinfo_brief(const struct sockaddr_nl *who, - struct nlmsghdr *n, - void *arg); -extern int print_addrinfo(const struct sockaddr_nl *who, - struct nlmsghdr *n, - void *arg); -extern int print_addrlabel(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_neigh(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_ntable(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int ipaddr_list(int argc, char **argv); -extern int ipaddr_list_link(int argc, char **argv); -void ipaddr_get_vf_rate(int, int *, int *, int); -extern int iproute_monitor(int argc, char **argv); -extern void iplink_usage(void) __attribute__((noreturn)); - -extern void iproute_reset_filter(int ifindex); -extern void ipmroute_reset_filter(int ifindex); -extern void ipaddr_reset_filter(int oneline, int ifindex); -extern void ipneigh_reset_filter(int ifindex); -extern void ipntable_reset_filter(void); -extern void ipnetconf_reset_filter(int ifindex); - -extern int print_route(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_mroute(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_prefix(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_rule(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int print_netconf(const struct sockaddr_nl *who, - struct rtnl_ctrl_data *ctrl, +int get_operstate(const char *name); +int print_linkinfo(const struct 
sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_linkinfo_brief(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); -extern void netns_map_init(void); -extern int print_nsid(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg); -extern int do_ipaddr(int argc, char **argv); -extern int do_ipaddrlabel(int argc, char **argv); -extern int do_iproute(int argc, char **argv); -extern int do_iprule(int argc, char **argv); -extern int do_ipneigh(int argc, char **argv); -extern int do_ipntable(int argc, char **argv); -extern int do_iptunnel(int argc, char **argv); -extern int do_ip6tunnel(int argc, char **argv); -extern int do_iptuntap(int argc, char **argv); -extern int do_iplink(int argc, char **argv); -extern int do_ipmonitor(int argc, char **argv); -extern int do_multiaddr(int argc, char **argv); -extern int do_multiroute(int argc, char **argv); -extern int do_multirule(int argc, char **argv); -extern int do_netns(int argc, char **argv); -extern int do_xfrm(int argc, char **argv); -extern int do_ipl2tp(int argc, char **argv); -extern int do_ipfou(int argc, char **argv); -extern int do_tcp_metrics(int argc, char **argv); -extern int do_ipnetconf(int argc, char **argv); -extern int do_iptoken(int argc, char **argv); -extern int iplink_get(unsigned int flags, char *name, __u32 filt_mask); +int print_addrinfo(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_addrlabel(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_neigh(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int ipaddr_list_link(int argc, char **argv); +void ipaddr_get_vf_rate(int, int *, int *, int); +void iplink_usage(void) __attribute__((noreturn)); + +void iproute_reset_filter(int ifindex); +void ipmroute_reset_filter(int ifindex); +void ipaddr_reset_filter(int oneline, int ifindex); +void ipneigh_reset_filter(int ifindex); +void ipnetconf_reset_filter(int ifindex); + +int print_route(const struct 
sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_mroute(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_prefix(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_rule(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int print_netconf(const struct sockaddr_nl *who, + struct rtnl_ctrl_data *ctrl, + struct nlmsghdr *n, void *arg); +void netns_map_init(void); +int print_nsid(const struct sockaddr_nl *who, + struct nlmsghdr *n, void *arg); +int do_ipaddr(int argc, char **argv); +int do_ipaddrlabel(int argc, char **argv); +int do_iproute(int argc, char **argv); +int do_iprule(int argc, char **argv); +int do_ipneigh(int argc, char **argv); +int do_ipntable(int argc, char **argv); +int do_iptunnel(int argc, char **argv); +int do_ip6tunnel(int argc, char **argv); +int do_iptuntap(int argc, char **argv); +int do_iplink(int argc, char **argv); +int do_ipmonitor(int argc, char **argv); +int do_multiaddr(int argc, char **argv); +int do_multiroute(int argc, char **argv); +int do_multirule(int argc, char **argv); +int do_netns(int argc, char **argv); +int do_xfrm(int argc, char **argv); +int do_ipl2tp(int argc, char **argv); +int do_ipfou(int argc, char **argv); +int do_tcp_metrics(int argc, char **argv); +int do_ipnetconf(int argc, char **argv); +int do_iptoken(int argc, char **argv); +int iplink_get(unsigned int flags, char *name, __u32 filt_mask); static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb) { diff --git a/ip/ipntable.c b/ip/ipntable.c index 5e84b951..6eb84e79 100644 --- a/ip/ipntable.c +++ b/ip/ipntable.c @@ -349,7 +349,7 @@ static const char *ntable_strtime_delta(__u32 msec) return str; } -int print_ntable(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +static int print_ntable(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) { FILE *fp = (FILE*)arg; struct ndtmsg *ndtm = NLMSG_DATA(n); @@ -601,7 +601,7 @@ int print_ntable(const struct 
sockaddr_nl *who, struct nlmsghdr *n, void *arg) return 0; } -void ipntable_reset_filter(void) +static void ipntable_reset_filter(void) { memset(&filter, 0, sizeof(filter)); } From 0198930b559fd04eb6410a087c2c3eab27c9ba03 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 23 Nov 2015 15:53:04 -0800 Subject: [PATCH 072/151] update kernel headers to 4.4-rc1 Post merge window changes --- include/linux/bpf.h | 45 ++++++++--------------------------------- include/linux/if_link.h | 1 + 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 37ba6017..2e2524d4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -63,50 +63,16 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; -/* BPF syscall commands */ +/* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { - /* create a map with given type and attributes - * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size) - * returns fd or negative error - * map is deleted when fd is closed - */ BPF_MAP_CREATE, - - /* lookup key in a given map - * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key, attr->value - * returns zero and stores found elem into value - * or negative error - */ BPF_MAP_LOOKUP_ELEM, - - /* create or update key/value pair in a given map - * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key, attr->value, attr->flags - * returns zero or negative error - */ BPF_MAP_UPDATE_ELEM, - - /* find and delete elem by key in a given map - * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key - * returns zero or negative error - */ BPF_MAP_DELETE_ELEM, - - /* lookup key in a given map and return next key - * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key, attr->next_key - * returns zero and stores next key or negative error - */ 
BPF_MAP_GET_NEXT_KEY, - - /* verify and load eBPF program - * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) - * Using attr->prog_type, attr->insns, attr->license - * returns fd or negative error - */ BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, }; enum bpf_map_type { @@ -160,6 +126,11 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + }; } __attribute__((aligned(8))); /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 31f584cb..5d206c71 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -459,6 +459,7 @@ enum { IFLA_GENEVE_TOS, IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) From 869fcabecc3f544c1ba17c8c2ee5a9c9296126dd Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 18 Nov 2015 16:57:46 +0100 Subject: [PATCH 073/151] lnstat: describe -s option in help output Signed-off-by: Phil Sutter --- misc/lnstat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/misc/lnstat.c b/misc/lnstat.c index 1e547d06..529bc33d 100644 --- a/misc/lnstat.c +++ b/misc/lnstat.c @@ -73,7 +73,10 @@ static int usage(char *name, int exit_code) fprintf(stderr, "\t-i --interval \t" "Set interval to 'intv' seconds\n"); fprintf(stderr, "\t-k --keys k,k,k,...\tDisplay only keys specified\n"); - fprintf(stderr, "\t-s --subject [0-2]\t?\n"); + fprintf(stderr, "\t-s --subject [0-2]\tControl header printing:\n"); + fprintf(stderr, "\t\t\t\t0 = never\n"); + fprintf(stderr, "\t\t\t\t1 = once\n"); + fprintf(stderr, "\t\t\t\t2 = every 20 lines (default))\n"); fprintf(stderr, "\t-w --width n,n,n,...\tWidth for each field\n"); fprintf(stderr, "\n"); From 
fdb347f7fd6c36b270a8c571cbe9a124a281b2b5 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 18 Nov 2015 16:57:47 +0100 Subject: [PATCH 074/151] lnstat: fix header displaying mechanism The algorithm depends on the loop counter ('i') to increment by one in each iteration. Though if running endlessly (count==0), the counter was not incremented at all. Also change formatting of the header printing conditional a bit so it's hopefully easier to read. Fixes: e7e2913 ("lnstat: run indefinitely by default") Signed-off-by: Phil Sutter --- misc/lnstat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/misc/lnstat.c b/misc/lnstat.c index 529bc33d..264c9531 100644 --- a/misc/lnstat.c +++ b/misc/lnstat.c @@ -359,21 +359,19 @@ int main(int argc, char **argv) if (interval < 1 ) interval = 1; - for (i = 0; i < count || !count; ) { + for (i = 0; i < count || !count; i++) { lnstat_update(lnstat_files); if (mode == MODE_JSON) print_json(stdout, lnstat_files, &fp); else { - if ((hdr > 1 && - (! (i % 20))) || (hdr == 1 && i == 0)) + if ((hdr > 1 && !(i % 20)) || + (hdr == 1 && i == 0)) print_hdr(stdout, header); print_line(stdout, lnstat_files, &fp); } fflush(stdout); if (i < count - 1 || !count) sleep(interval); - if (count) - ++i; } break; } From 6e2e2cf03a2fe5777ee75e943444a343756042c0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 18 Nov 2015 12:46:42 +0100 Subject: [PATCH 075/151] bridge.8: document fdb replace command Despite commit 45a82e5 ("iproute vxlan add support for fdb replace command"), the 'fdb replace' command was not mentioned in bridge.8. 
Signed-off-by: Phil Sutter --- man/man8/bridge.8 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index c9a550dd..98a92eb8 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -50,7 +50,7 @@ bridge \- show / manipulate bridge addresses and devices .IR DEV " ]" .ti -8 -.BR "bridge fdb" " { " add " | " append " | " del " } " +.BR "bridge fdb" " { " add " | " append " | " del " | " replace " } " .I LLADDR .B dev .IR DEV " { " @@ -407,6 +407,13 @@ This command removes an existing fdb entry. The arguments are the same as with .BR "bridge fdb add" , +.SS bridge fdb replace - replace a forwarding database entry +If no matching entry is found, a new one will be created instead. + +.PP +The arguments are the same as with +.BR "bridge fdb add" , + .SS bridge fdb show - list forwarding entries. This command displays the current forwarding table. From e149d4e84384f88965ce43a6390acf7ba356187c Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 5 Nov 2015 14:54:17 -0500 Subject: [PATCH 076/151] iproute2: Ignore EADDRNOTAVAIL errors during address flush operation I found recently that, if I disabled address promotion in the kernel, that ip addr flush dev would fail with an EADDRNOTAVAIL errno (though the flush operation would in fact flush all addresses from an interface properly) What's happening is that, if I add a primary and multiple secondary addresses to an interface, the flush operation first enumerates them all with a GETADDR | DUMP operation, then sends a delete request for each address. But the kernel, having promotion disabled, deletes all secondary addresses when the primary is removed. That means, that several delete requests may still be pending in the netlink request for addresses that have been removed on our behalf, resulting in EADDRNOTAVAIL return codes.
It seems the simplest thing to do is to understand that EADDRNOTAVAIL isn't a fatal outcome on a flush operation, as it just indicates that an address which you want to remove is already removed, so it can safely be ignored. Signed-off-by: Neil Horman CC: Stephen Hemminger CC: Alexey Kuznetsov --- ip/ipaddress.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index f290205b..05358c97 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -893,7 +893,17 @@ int print_linkinfo(const struct sockaddr_nl *who, static int flush_update(void) { - if (rtnl_send_check(&rth, filter.flushb, filter.flushp) < 0) { + + /* + * Note that the kernel may delete multiple addresses for one + * delete request (e.g. if ipv4 address promotion is disabled). + * Since a flush operation is really a series of delete requests + * its possible that we may request an address delete that has + * already been done by the kernel. Therefore, ignore EADDRNOTAVAIL + * errors returned from a flush request + */ + if ((rtnl_send_check(&rth, filter.flushb, filter.flushp) < 0) && + (errno != EADDRNOTAVAIL)) { perror("Failed to send flush request"); return -1; } From 32e93fb7f66d55d597b52ec3b10fd44a47784114 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Nov 2015 00:39:29 +0100 Subject: [PATCH 077/151] {f,m}_bpf: allow for sharing maps This larger work addresses one of the bigger remaining issues on tc's eBPF frontend, that is, to allow for persistent file descriptors. Whenever tc parses the ELF object, extracts and loads maps into the kernel, these file descriptors will be out of reach after the tc instance exits. Meaning, for simple (unnested) programs which contain one or multiple maps, the kernel holds a reference, and they will live on inside the kernel until the program holding them is unloaded, but they will be out of reach for user space, even worse with (also multiple nested) tail calls.
For this issue, we introduced the concept of an agent that can receive the set of file descriptors from the tc instance creating them, in order to be able to further inspect/update map data for a specific use case. However, while that is more tied towards specific applications, it still doesn't easily allow for sharing maps across multiple tc instances and would require a daemon to be running in the background. F.e. when a map should be shared by two eBPF programs, one attached to ingress, one to egress, this currently doesn't work with the tc frontend. This work solves exactly that, i.e. if requested, maps can now be _arbitrarily_ shared between object files (PIN_GLOBAL_NS) or within a single object (but various program sections, PIN_OBJECT_NS) without "losing" the file descriptor set. To make that happen, we use eBPF object pinning introduced in kernel commit b2197755b263 ("bpf: add support for persistent maps/progs") for exactly this purpose. The shipped examples/bpf/bpf_shared.c code from this patch can be easily applied, for instance, as: - classifier-classifier shared: tc filter add dev foo parent 1: bpf obj shared.o sec egress tc filter add dev foo parent ffff: bpf obj shared.o sec ingress - classifier-action shared (here: late binding to a dummy classifier): tc actions add action bpf obj shared.o sec egress pass index 42 tc filter add dev foo parent ffff: bpf obj shared.o sec ingress tc filter add dev foo parent 1: bpf bytecode '1,6 0 0 4294967295,' \ action bpf index 42 The toy example increments a shared counter on egress and dumps its value on ingress (if no sharing (PIN_NONE) would have been chosen, map value is 0, of course, due to the two map instances being created): [...] -0 [002] ..s. 38264.788234: : map val: 4 -0 [002] ..s. 38264.788919: : map val: 4 -0 [002] ..s. 38264.789599: : map val: 5 [...] ... thus if both sections reference the pinned map(s) in question, tc will take care of fetching the appropriate file descriptor.
The patch has been tested extensively on both, classifier and action sides. Signed-off-by: Daniel Borkmann --- examples/bpf/bpf_funcs.h | 7 + examples/bpf/bpf_shared.c | 54 ++ examples/bpf/bpf_shared.h | 4 - include/bpf_elf.h | 6 + include/utils.h | 3 + tc/e_bpf.c | 18 +- tc/f_bpf.c | 131 +--- tc/m_bpf.c | 158 ++--- tc/tc_bpf.c | 1329 +++++++++++++++++++++++++++---------- tc/tc_bpf.h | 73 +- 10 files changed, 1140 insertions(+), 643 deletions(-) create mode 100644 examples/bpf/bpf_shared.c diff --git a/examples/bpf/bpf_funcs.h b/examples/bpf/bpf_funcs.h index 1545fa9d..1369401a 100644 --- a/examples/bpf/bpf_funcs.h +++ b/examples/bpf/bpf_funcs.h @@ -1,6 +1,10 @@ #ifndef __BPF_FUNCS__ #define __BPF_FUNCS__ +#include + +#include "../../include/bpf_elf.h" + /* Misc macros. */ #ifndef __maybe_unused # define __maybe_unused __attribute__ ((__unused__)) @@ -43,6 +47,9 @@ static unsigned int (*get_smp_processor_id)(void) __maybe_unused = static unsigned int (*get_prandom_u32)(void) __maybe_unused = (void *) BPF_FUNC_get_prandom_u32; +static int (*bpf_printk)(const char *fmt, int fmt_size, ...) __maybe_unused = + (void *) BPF_FUNC_trace_printk; + /* LLVM built-in functions that an eBPF C program may use to emit * BPF_LD_ABS and BPF_LD_IND instructions. */ diff --git a/examples/bpf/bpf_shared.c b/examples/bpf/bpf_shared.c new file mode 100644 index 00000000..a8dc39c7 --- /dev/null +++ b/examples/bpf/bpf_shared.c @@ -0,0 +1,54 @@ +#include + +#include "bpf_funcs.h" + +/* Minimal, stand-alone toy map pinning example: + * + * clang -target bpf -O2 [...] -o bpf_shared.o -c bpf_shared.c + * tc filter add dev foo parent 1: bpf obj bpf_shared.o sec egress + * tc filter add dev foo parent ffff: bpf obj bpf_shared.o sec ingress + * + * Both classifier will share the very same map instance in this example, + * so map content can be accessed from ingress *and* egress side! 
+ * + * This example has a pinning of PIN_OBJECT_NS, so it's private and + * thus shared among various program sections within the object. + * + * A setting of PIN_GLOBAL_NS would place it into a global namespace, + * so that it can be shared among different object files. A setting + * of PIN_NONE (= 0) means no sharing, so each tc invocation a new map + * instance is being created. + */ + +struct bpf_elf_map __section("maps") map_sh = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_OBJECT_NS, /* or PIN_GLOBAL_NS, or PIN_NONE */ + .max_elem = 1, +}; + +__section("egress") int emain(struct __sk_buff *skb) +{ + int key = 0, *val; + + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) + __sync_fetch_and_add(val, 1); + + return -1; +} + +__section("ingress") int imain(struct __sk_buff *skb) +{ + char fmt[] = "map val: %d\n"; + int key = 0, *val; + + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) + bpf_printk(fmt, sizeof(fmt), *val); + + return -1; +} + +char __license[] __section("license") = "GPL"; diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h index 46423eca..ea8f0147 100644 --- a/examples/bpf/bpf_shared.h +++ b/examples/bpf/bpf_shared.h @@ -1,10 +1,6 @@ #ifndef __BPF_SHARED__ #define __BPF_SHARED__ -#include - -#include "../../include/bpf_elf.h" - enum { BPF_MAP_ID_PROTO, BPF_MAP_ID_QUEUE, diff --git a/include/bpf_elf.h b/include/bpf_elf.h index 4bd6bb00..0690dd6a 100644 --- a/include/bpf_elf.h +++ b/include/bpf_elf.h @@ -21,6 +21,11 @@ #define ELF_MAX_MAPS 64 #define ELF_MAX_LICENSE_LEN 128 +/* Object pinning settings */ +#define PIN_NONE 0 +#define PIN_OBJECT_NS 1 +#define PIN_GLOBAL_NS 2 + /* ELF map definition */ struct bpf_elf_map { __u32 type; @@ -28,6 +33,7 @@ struct bpf_elf_map { __u32 size_value; __u32 max_elem; __u32 id; + __u8 pinning; }; #endif /* __BPF_ELF__ */ diff --git a/include/utils.h b/include/utils.h index 1d351490..5902a985 100644 --- a/include/utils.h +++ 
b/include/utils.h @@ -192,6 +192,9 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n); __attribute__ ((format (printf, (pos_str), (pos_args)))) #endif +#define _textify(x) #x +#define textify(x) _textify(x) + #define htonll(x) ((1==htonl(1)) ? (x) : ((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32)) #define ntohll(x) ((1==ntohl(1)) ? (x) : ((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32)) diff --git a/tc/e_bpf.c b/tc/e_bpf.c index 218ba404..1f386c36 100644 --- a/tc/e_bpf.c +++ b/tc/e_bpf.c @@ -26,7 +26,7 @@ static char *argv_default[] = { BPF_DEFAULT_CMD, NULL }; static void explain(void) { - fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ]\n\n"); + fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ] [ debug ]\n\n"); fprintf(stderr, "Where UDS_FILE provides the name of a unix domain socket file\n"); fprintf(stderr, "to import eBPF maps and the optional CMD denotes the command\n"); fprintf(stderr, "to be executed (default: \'%s\').\n", BPF_DEFAULT_CMD); @@ -58,17 +58,21 @@ static int parse_bpf(struct exec_util *eu, int argc, char **argv) NEXT_ARG(); argv_run = argv; break; - } else if (matches(*argv, "import") == 0 || - matches(*argv, "imp") == 0) { + } else if (matches(*argv, "import") == 0) { NEXT_ARG(); bpf_uds_name = *argv; + } else if (matches(*argv, "debug") == 0 || + matches(*argv, "dbg") == 0) { + if (bpf_trace_pipe()) + fprintf(stderr, + "No trace pipe, tracefs not mounted?\n"); + return -1; } else { explain(); return -1; } - argc--; - argv++; + NEXT_ARG_FWD(); } if (!bpf_uds_name) { @@ -142,6 +146,6 @@ err: } struct exec_util bpf_exec_util = { - .id = "bpf", - .parse_eopt = parse_bpf, + .id = "bpf", + .parse_eopt = parse_bpf, }; diff --git a/tc/f_bpf.c b/tc/f_bpf.c index ac77af58..afc2e582 100644 --- a/tc/f_bpf.c +++ b/tc/f_bpf.c @@ -11,19 +11,8 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include 
-#include + +#include #include "utils.h" #include "tc_util.h" @@ -31,6 +20,13 @@ static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_CLS; +static const int nla_tbl[BPF_NLA_MAX] = { + [BPF_NLA_OPS_LEN] = TCA_BPF_OPS_LEN, + [BPF_NLA_OPS] = TCA_BPF_OPS, + [BPF_NLA_FD] = TCA_BPF_FD, + [BPF_NLA_NAME] = TCA_BPF_NAME, +}; + static void explain(void) { fprintf(stderr, "Usage: ... bpf ...\n"); @@ -42,6 +38,7 @@ static void explain(void) fprintf(stderr, "eBPF use case:\n"); fprintf(stderr, " object-file FILE [ section CLS_NAME ] [ export UDS_FILE ]"); fprintf(stderr, " [ verbose ] [ direct-action ]\n"); + fprintf(stderr, " object-pinned FILE [ direct-action ]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Common remaining options:\n"); fprintf(stderr, " [ action ACTION_SPEC ]\n"); @@ -51,7 +48,8 @@ static void explain(void) fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n"); - fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n"); + fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode, or a\n"); + fprintf(stderr, "pinned eBPF program.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where CLS_NAME refers to the section name containing the\n"); fprintf(stderr, "classifier (default \'%s\').\n", bpf_default_section(bpf_type)); @@ -66,119 +64,38 @@ static void explain(void) static int bpf_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) { + const char *bpf_obj = NULL, *bpf_uds_name = NULL; struct tcmsg *t = NLMSG_DATA(n); - const char *bpf_uds_name = NULL; - const char *bpf_sec_name = NULL; unsigned int bpf_flags = 0; - char *bpf_obj = NULL; - struct rtattr *tail; bool seen_run = false; - long h = 0; + struct rtattr *tail; int ret = 0; if (argc == 0) return 0; if (handle) { - h = strtol(handle, NULL, 0); - if (h == LONG_MIN || h == LONG_MAX) { - 
fprintf(stderr, "Illegal handle \"%s\", must be " - "numeric.\n", handle); + if (get_u32(&t->tcm_handle, handle, 0)) { + fprintf(stderr, "Illegal \"handle\"\n"); return -1; } } - t->tcm_handle = h; - tail = (struct rtattr *)(((void *)n) + NLMSG_ALIGN(n->nlmsg_len)); addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0); while (argc > 0) { if (matches(*argv, "run") == 0) { - struct sock_filter bpf_ops[BPF_MAXINSNS]; - bool from_file, ebpf, bpf_verbose; - int ret; - NEXT_ARG(); opt_bpf: - bpf_sec_name = bpf_default_section(bpf_type); - bpf_verbose = false; - ebpf = false; seen_run = true; - - if (strcmp(*argv, "bytecode-file") == 0 || - strcmp(*argv, "bcf") == 0) { - from_file = true; - } else if (strcmp(*argv, "bytecode") == 0 || - strcmp(*argv, "bc") == 0) { - from_file = false; - } else if (strcmp(*argv, "object-file") == 0 || - strcmp(*argv, "obj") == 0) { - ebpf = true; - } else { - fprintf(stderr, "What is \"%s\"?\n", *argv); - explain(); + if (bpf_parse_common(&argc, &argv, nla_tbl, bpf_type, + &bpf_obj, &bpf_uds_name, n)) { + fprintf(stderr, "Failed to retrieve (e)BPF data!\n"); return -1; } - - NEXT_ARG(); - if (ebpf) { - bpf_uds_name = getenv(BPF_ENV_UDS); - bpf_obj = *argv; - - NEXT_ARG_FWD(); - - if (argc > 0 && - (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0)) { - NEXT_ARG(); - bpf_sec_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && !bpf_uds_name && - (strcmp(*argv, "export") == 0 || - strcmp(*argv, "exp") == 0)) { - NEXT_ARG(); - bpf_uds_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && - (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0)) { - bpf_verbose = true; - NEXT_ARG_FWD(); - } - - PREV_ARG(); - } - - ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name, - bpf_verbose) : - bpf_parse_ops(argc, argv, bpf_ops, from_file); - if (ret < 0) { - fprintf(stderr, "%s\n", ebpf ? 
- "Could not load object" : - "Illegal \"bytecode\""); - return -1; - } - - if (ebpf) { - char bpf_name[256]; - - bpf_obj = basename(bpf_obj); - - snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]", - bpf_obj, bpf_sec_name); - - addattr32(n, MAX_MSG, TCA_BPF_FD, ret); - addattrstrz(n, MAX_MSG, TCA_BPF_NAME, bpf_name); - } else { - addattr16(n, MAX_MSG, TCA_BPF_OPS_LEN, ret); - addattr_l(n, MAX_MSG, TCA_BPF_OPS, &bpf_ops, - ret * sizeof(struct sock_filter)); - } } else if (matches(*argv, "classid") == 0 || - strcmp(*argv, "flowid") == 0) { + matches(*argv, "flowid") == 0) { unsigned int handle; NEXT_ARG(); @@ -204,7 +121,7 @@ opt_bpf: return -1; } continue; - } else if (strcmp(*argv, "help") == 0) { + } else if (matches(*argv, "help") == 0) { explain(); return -1; } else { @@ -280,7 +197,7 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, } struct filter_util bpf_filter_util = { - .id = "bpf", - .parse_fopt = bpf_parse_opt, - .print_fopt = bpf_print_opt, + .id = "bpf", + .parse_fopt = bpf_parse_opt, + .print_fopt = bpf_print_opt, }; diff --git a/tc/m_bpf.c b/tc/m_bpf.c index fb4c3c7f..c5e2fa5b 100644 --- a/tc/m_bpf.c +++ b/tc/m_bpf.c @@ -12,20 +12,23 @@ #include #include -#include -#include -#include -#include + #include #include #include "utils.h" -#include "rt_names.h" #include "tc_util.h" #include "tc_bpf.h" static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_ACT; +static const int nla_tbl[BPF_NLA_MAX] = { + [BPF_NLA_OPS_LEN] = TCA_ACT_BPF_OPS_LEN, + [BPF_NLA_OPS] = TCA_ACT_BPF_OPS, + [BPF_NLA_FD] = TCA_ACT_BPF_FD, + [BPF_NLA_NAME] = TCA_ACT_BPF_NAME, +}; + static void explain(void) { fprintf(stderr, "Usage: ... bpf ... 
[ index INDEX ]\n"); @@ -37,12 +40,14 @@ static void explain(void) fprintf(stderr, "eBPF use case:\n"); fprintf(stderr, " object-file FILE [ section ACT_NAME ] [ export UDS_FILE ]"); fprintf(stderr, " [ verbose ]\n"); + fprintf(stderr, " object-pinned FILE\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n"); fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n"); - fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n"); + fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode, or a\n"); + fprintf(stderr, "pinned eBPF program.\n"); fprintf(stderr, "\n"); fprintf(stderr, "Where ACT_NAME refers to the section name containing the\n"); fprintf(stderr, "action (default \'%s\').\n", bpf_default_section(bpf_type)); @@ -54,114 +59,40 @@ static void explain(void) fprintf(stderr, "explicitly specifies an action index upon creation.\n"); } -static void usage(void) +static int bpf_parse_opt(struct action_util *a, int *ptr_argc, char ***ptr_argv, + int tca_id, struct nlmsghdr *n) { - explain(); - exit(-1); -} - -static int parse_bpf(struct action_util *a, int *argc_p, char ***argv_p, - int tca_id, struct nlmsghdr *n) -{ - char **argv = *argv_p, bpf_name[256]; + const char *bpf_obj = NULL, *bpf_uds_name = NULL; + struct tc_act_bpf parm; + bool seen_run = false; struct rtattr *tail; - struct tc_act_bpf parm = { 0 }; - struct sock_filter bpf_ops[BPF_MAXINSNS]; - bool ebpf_fill = false, bpf_fill = false; - bool ebpf = false, seen_run = false; - const char *bpf_uds_name = NULL; - const char *bpf_sec_name = NULL; - char *bpf_obj = NULL; - int argc = *argc_p, ret = 0; - __u16 bpf_len = 0; - __u32 bpf_fd = 0; + int argc, ret = 0; + char **argv; + + argv = *ptr_argv; + argc = *ptr_argc; if (matches(*argv, "bpf") != 0) return -1; NEXT_ARG(); + tail = NLMSG_TAIL(n); 
+ addattr_l(n, MAX_MSG, tca_id, NULL, 0); + while (argc > 0) { if (matches(*argv, "run") == 0) { - bool from_file, bpf_verbose; - int ret; - NEXT_ARG(); opt_bpf: - bpf_sec_name = bpf_default_section(bpf_type); - bpf_verbose = false; seen_run = true; - - if (strcmp(*argv, "bytecode-file") == 0 || - strcmp(*argv, "bcf") == 0) { - from_file = true; - } else if (strcmp(*argv, "bytecode") == 0 || - strcmp(*argv, "bc") == 0) { - from_file = false; - } else if (strcmp(*argv, "object-file") == 0 || - strcmp(*argv, "obj") == 0) { - ebpf = true; - } else { - fprintf(stderr, "unexpected \"%s\"\n", *argv); - explain(); + if (bpf_parse_common(&argc, &argv, nla_tbl, bpf_type, + &bpf_obj, &bpf_uds_name, n)) { + fprintf(stderr, "Failed to retrieve (e)BPF data!\n"); return -1; } - - NEXT_ARG(); - if (ebpf) { - bpf_uds_name = getenv(BPF_ENV_UDS); - bpf_obj = *argv; - - NEXT_ARG_FWD(); - - if (argc > 0 && - (strcmp(*argv, "section") == 0 || - strcmp(*argv, "sec") == 0)) { - NEXT_ARG(); - bpf_sec_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && !bpf_uds_name && - (strcmp(*argv, "export") == 0 || - strcmp(*argv, "exp") == 0)) { - NEXT_ARG(); - bpf_uds_name = *argv; - NEXT_ARG_FWD(); - } - if (argc > 0 && - (strcmp(*argv, "verbose") == 0 || - strcmp(*argv, "verb") == 0)) { - bpf_verbose = true; - NEXT_ARG_FWD(); - } - - PREV_ARG(); - } - - ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name, - bpf_verbose) : - bpf_parse_ops(argc, argv, bpf_ops, from_file); - if (ret < 0) { - fprintf(stderr, "%s\n", ebpf ? 
- "Could not load object" : - "Illegal \"bytecode\""); - return -1; - } - - if (ebpf) { - bpf_obj = basename(bpf_obj); - - snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]", - bpf_obj, bpf_sec_name); - - bpf_fd = ret; - ebpf_fill = true; - } else { - bpf_len = ret; - bpf_fill = true; - } } else if (matches(*argv, "help") == 0) { - usage(); + explain(); + return -1; } else if (matches(*argv, "index") == 0) { break; } else { @@ -173,7 +104,9 @@ opt_bpf: NEXT_ARG_FWD(); } + memset(&parm, 0, sizeof(parm)); parm.action = TC_ACT_PIPE; + if (argc) { if (matches(*argv, "reclassify") == 0) { parm.action = TC_ACT_RECLASSIFY; @@ -207,32 +140,19 @@ opt_bpf: } } - tail = NLMSG_TAIL(n); - - addattr_l(n, MAX_MSG, tca_id, NULL, 0); addattr_l(n, MAX_MSG, TCA_ACT_BPF_PARMS, &parm, sizeof(parm)); - - if (ebpf_fill) { - addattr32(n, MAX_MSG, TCA_ACT_BPF_FD, bpf_fd); - addattrstrz(n, MAX_MSG, TCA_ACT_BPF_NAME, bpf_name); - } else if (bpf_fill) { - addattr16(n, MAX_MSG, TCA_ACT_BPF_OPS_LEN, bpf_len); - addattr_l(n, MAX_MSG, TCA_ACT_BPF_OPS, &bpf_ops, - bpf_len * sizeof(struct sock_filter)); - } - tail->rta_len = (char *)NLMSG_TAIL(n) - (char *)tail; - *argc_p = argc; - *argv_p = argv; - if (bpf_uds_name) ret = bpf_send_map_fds(bpf_uds_name, bpf_obj); + *ptr_argc = argc; + *ptr_argv = argv; + return ret; } -static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) +static int bpf_print_opt(struct action_util *au, FILE *f, struct rtattr *arg) { struct rtattr *tb[TCA_ACT_BPF_MAX + 1]; struct tc_act_bpf *parm; @@ -249,7 +169,6 @@ static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) } parm = RTA_DATA(tb[TCA_ACT_BPF_PARMS]); - fprintf(f, "bpf "); if (tb[TCA_ACT_BPF_NAME]) @@ -276,12 +195,11 @@ static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg) } fprintf(f, "\n "); - return 0; } struct action_util bpf_action_util = { - .id = "bpf", - .parse_aopt = parse_bpf, - .print_aopt = print_bpf, + .id = "bpf", + .parse_aopt = bpf_parse_opt, + 
.print_aopt = bpf_print_opt, }; diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 276871a5..bc7bc9ff 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -20,18 +20,25 @@ #include #include #include -#include -#include -#include -#include -#include -#include #ifdef HAVE_ELF #include #include #endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + #include "utils.h" #include "bpf_elf.h" @@ -40,9 +47,47 @@ #include "tc_util.h" #include "tc_bpf.h" -int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, - char **bpf_string, bool *need_release, - const char separator) +#ifdef HAVE_ELF +static int bpf_obj_open(const char *path, enum bpf_prog_type type, + const char *sec, bool verbose); +#else +static int bpf_obj_open(const char *path, enum bpf_prog_type type, + const char *sec, bool verbose) +{ + fprintf(stderr, "No ELF library support compiled in.\n"); + errno = ENOSYS; + return -1; +} +#endif + +static inline __u64 bpf_ptr_to_u64(const void *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +static int bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ +#ifdef __NR_bpf + return syscall(__NR_bpf, cmd, attr, size); +#else + fprintf(stderr, "No bpf syscall, kernel headers too old?\n"); + errno = ENOSYS; + return -1; +#endif +} + +static int bpf_obj_get(const char *pathname) +{ + union bpf_attr attr = { + .pathname = bpf_ptr_to_u64(pathname), + }; + + return bpf(BPF_OBJ_GET, &attr, sizeof(attr)); +} + +static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, + char **bpf_string, bool *need_release, + const char separator) { char sp; @@ -90,8 +135,8 @@ int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, return 0; } -int bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, - bool from_file) +static int bpf_ops_parse(int argc, char **argv, struct sock_filter *bpf_ops, + bool from_file) { char *bpf_string, *token, separator = ','; int ret = 0, i = 0; @@ -135,7 +180,6 @@ int 
bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, goto out; } ret = bpf_len; - out: if (need_release) free(bpf_string); @@ -161,6 +205,97 @@ void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len) ops[i].jf, ops[i].k); } +static int bpf_valid_mntpt(const char *mnt, unsigned long magic) +{ + struct statfs st_fs; + + if (statfs(mnt, &st_fs) < 0) + return -ENOENT; + if ((unsigned long)st_fs.f_type != magic) + return -ENOENT; + + return 0; +} + +static const char *bpf_find_mntpt(const char *fstype, unsigned long magic, + char *mnt, int len, + const char * const *known_mnts) +{ + const char * const *ptr; + char type[100]; + FILE *fp; + + if (known_mnts) { + ptr = known_mnts; + while (*ptr) { + if (bpf_valid_mntpt(*ptr, magic) == 0) { + strncpy(mnt, *ptr, len - 1); + mnt[len - 1] = 0; + return mnt; + } + ptr++; + } + } + + fp = fopen("/proc/mounts", "r"); + if (fp == NULL || len != PATH_MAX) + return NULL; + + while (fscanf(fp, "%*s %" textify(PATH_MAX) "s %99s %*s %*d %*d\n", + mnt, type) == 2) { + if (strcmp(type, fstype) == 0) + break; + } + + fclose(fp); + if (strcmp(type, fstype) != 0) + return NULL; + + return mnt; +} + +int bpf_trace_pipe(void) +{ + char tracefs_mnt[PATH_MAX] = TRACE_DIR_MNT; + static const char * const tracefs_known_mnts[] = { + TRACE_DIR_MNT, + "/sys/kernel/debug/tracing", + "/tracing", + "/trace", + 0, + }; + char tpipe[PATH_MAX]; + const char *mnt; + int fd; + + mnt = bpf_find_mntpt("tracefs", TRACEFS_MAGIC, tracefs_mnt, + sizeof(tracefs_mnt), tracefs_known_mnts); + if (!mnt) { + fprintf(stderr, "tracefs not mounted?\n"); + return -1; + } + + snprintf(tpipe, sizeof(tpipe), "%s/trace_pipe", mnt); + + fd = open(tpipe, O_RDONLY); + if (fd < 0) + return -1; + + fprintf(stderr, "Running! 
Hang up with ^C!\n\n"); + while (1) { + static char buff[4096]; + ssize_t ret; + + ret = read(fd, buff, sizeof(buff) - 1); + if (ret > 0) { + write(2, buff, ret); + fflush(stderr); + } + } + + return 0; +} + const char *bpf_default_section(const enum bpf_prog_type type) { switch (type) { @@ -173,18 +308,139 @@ const char *bpf_default_section(const enum bpf_prog_type type) } } +int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, + enum bpf_prog_type type, const char **ptr_object, + const char **ptr_uds_name, struct nlmsghdr *n) +{ + struct sock_filter opcodes[BPF_MAXINSNS]; + const char *file, *section, *uds_name; + char **argv = *ptr_argv; + int argc = *ptr_argc; + char annotation[256]; + bool verbose = false; + int ret; + enum bpf_mode { + CBPF_BYTECODE, + CBPF_FILE, + EBPF_OBJECT, + EBPF_PINNED, + } mode; + + if (matches(*argv, "bytecode") == 0 || + strcmp(*argv, "bc") == 0) { + mode = CBPF_BYTECODE; + } else if (matches(*argv, "bytecode-file") == 0 || + strcmp(*argv, "bcf") == 0) { + mode = CBPF_FILE; + } else if (matches(*argv, "object-file") == 0 || + strcmp(*argv, "obj") == 0) { + mode = EBPF_OBJECT; + } else if (matches(*argv, "object-pinned") == 0 || + matches(*argv, "pinned") == 0 || + matches(*argv, "fd") == 0) { + mode = EBPF_PINNED; + } else { + fprintf(stderr, "What mode is \"%s\"?\n", *argv); + return -1; + } + + NEXT_ARG(); + file = section = uds_name = NULL; + if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { + file = *argv; + NEXT_ARG_FWD(); + + section = bpf_default_section(type); + if (argc > 0 && matches(*argv, "section") == 0) { + NEXT_ARG(); + section = *argv; + NEXT_ARG_FWD(); + } + + uds_name = getenv(BPF_ENV_UDS); + if (argc > 0 && !uds_name && + matches(*argv, "export") == 0) { + NEXT_ARG(); + uds_name = *argv; + NEXT_ARG_FWD(); + } + + if (argc > 0 && matches(*argv, "verbose") == 0) { + verbose = true; + NEXT_ARG_FWD(); + } + + PREV_ARG(); + } + + if (mode == CBPF_BYTECODE || mode == CBPF_FILE) + ret = 
bpf_ops_parse(argc, argv, opcodes, mode == CBPF_FILE); + else if (mode == EBPF_OBJECT) + ret = bpf_obj_open(file, type, section, verbose); + else if (mode == EBPF_PINNED) + ret = bpf_obj_get(file); + if (ret < 0) + return -1; + + if (mode == CBPF_BYTECODE || mode == CBPF_FILE) { + addattr16(n, MAX_MSG, nla_tbl[BPF_NLA_OPS_LEN], ret); + addattr_l(n, MAX_MSG, nla_tbl[BPF_NLA_OPS], opcodes, + ret * sizeof(struct sock_filter)); + } else if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { + snprintf(annotation, sizeof(annotation), "%s:[%s]", + basename(file), mode == EBPF_PINNED ? "*fsobj" : + section); + + addattr32(n, MAX_MSG, nla_tbl[BPF_NLA_FD], ret); + addattrstrz(n, MAX_MSG, nla_tbl[BPF_NLA_NAME], annotation); + } + + *ptr_object = file; + *ptr_uds_name = uds_name; + + *ptr_argc = argc; + *ptr_argv = argv; + + return 0; +} + #ifdef HAVE_ELF +struct bpf_elf_prog { + enum bpf_prog_type type; + const struct bpf_insn *insns; + size_t size; + const char *license; +}; + +struct bpf_elf_ctx { + Elf *elf_fd; + GElf_Ehdr elf_hdr; + Elf_Data *sym_tab; + Elf_Data *str_tab; + int obj_fd; + int map_fds[ELF_MAX_MAPS]; + struct bpf_elf_map maps[ELF_MAX_MAPS]; + int sym_num; + int map_num; + bool *sec_done; + int sec_maps; + char license[ELF_MAX_LICENSE_LEN]; + enum bpf_prog_type type; + bool verbose; + struct bpf_elf_st stat; +}; + struct bpf_elf_sec_data { - GElf_Shdr sec_hdr; - char *sec_name; - Elf_Data *sec_data; + GElf_Shdr sec_hdr; + Elf_Data *sec_data; + const char *sec_name; }; struct bpf_map_data { - int *fds; - const char *obj; - struct bpf_elf_st *st; - struct bpf_elf_map *ent; + int *fds; + const char *obj; + struct bpf_elf_st *st; + struct bpf_elf_map *ent; }; /* If we provide a small buffer with log level enabled, the kernel @@ -193,15 +449,8 @@ struct bpf_map_data { * verifier we still want to hand something descriptive to the user. 
*/ static char bpf_log_buf[65536]; -static bool bpf_verbose; -static struct bpf_elf_st bpf_st; - -static int map_fds[ELF_MAX_MAPS]; -static struct bpf_elf_map map_ent[ELF_MAX_MAPS]; - -static void bpf_dump_error(const char *format, ...) __check_format_string(1, 2); -static void bpf_dump_error(const char *format, ...) +static __check_format_string(1, 2) void bpf_dump_error(const char *format, ...) { va_list vl; @@ -215,46 +464,7 @@ static void bpf_dump_error(const char *format, ...) } } -static void bpf_save_finfo(int file_fd) -{ - struct stat st; - int ret; - - memset(&bpf_st, 0, sizeof(bpf_st)); - - ret = fstat(file_fd, &st); - if (ret < 0) { - fprintf(stderr, "Stat of elf file failed: %s\n", - strerror(errno)); - return; - } - - bpf_st.st_dev = st.st_dev; - bpf_st.st_ino = st.st_ino; -} - -static void bpf_clear_finfo(void) -{ - memset(&bpf_st, 0, sizeof(bpf_st)); -} - -static bool bpf_may_skip_map_creation(int file_fd) -{ - struct stat st; - int ret; - - ret = fstat(file_fd, &st); - if (ret < 0) { - fprintf(stderr, "Stat of elf file failed: %s\n", - strerror(errno)); - return false; - } - - return (bpf_st.st_dev == st.st_dev) && - (bpf_st.st_ino == st.st_ino); -} - -static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, +static int bpf_map_create(enum bpf_map_type type, unsigned int size_key, unsigned int size_value, unsigned int max_elem) { union bpf_attr attr = { @@ -267,7 +477,7 @@ static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } -static int bpf_update_map(int fd, const void *key, const void *value, +static int bpf_map_update(int fd, const void *key, const void *value, uint64_t flags) { union bpf_attr attr = { @@ -281,121 +491,429 @@ static int bpf_update_map(int fd, const void *key, const void *value, } static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, - unsigned int len, const char *license) + size_t size, const char *license) { union 
bpf_attr attr = { .prog_type = type, .insns = bpf_ptr_to_u64(insns), - .insn_cnt = len / sizeof(struct bpf_insn), + .insn_cnt = size / sizeof(struct bpf_insn), .license = bpf_ptr_to_u64(license), .log_buf = bpf_ptr_to_u64(bpf_log_buf), .log_size = sizeof(bpf_log_buf), .log_level = 1, }; + if (getenv(BPF_ENV_NOLOG)) { + attr.log_buf = 0; + attr.log_size = 0; + attr.log_level = 0; + } + return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } -static int bpf_prog_attach(enum bpf_prog_type type, const char *sec, - const struct bpf_insn *insns, unsigned int size, - const char *license) +static int bpf_obj_pin(int fd, const char *pathname) { - int prog_fd = bpf_prog_load(type, insns, size, license); + union bpf_attr attr = { + .pathname = bpf_ptr_to_u64(pathname), + .bpf_fd = fd, + }; - if (prog_fd < 0 || bpf_verbose) { - bpf_dump_error("%s (section \'%s\'): %s\n", prog_fd < 0 ? - "BPF program rejected" : - "BPF program verification", - sec, strerror(errno)); + return bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); +} + +static int bpf_obj_hash(const char *object, uint8_t *out, size_t len) +{ + struct sockaddr_alg alg = { + .salg_family = AF_ALG, + .salg_type = "hash", + .salg_name = "sha1", + }; + int ret, cfd, ofd, ffd; + struct stat stbuff; + ssize_t size; + + if (!object || len != 20) + return -EINVAL; + + cfd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (cfd < 0) { + fprintf(stderr, "Cannot get AF_ALG socket: %s\n", + strerror(errno)); + return cfd; } - return prog_fd; -} - -static int bpf_map_attach(enum bpf_map_type type, unsigned int size_key, - unsigned int size_value, unsigned int max_elem) -{ - int map_fd = bpf_create_map(type, size_key, size_value, max_elem); - - if (map_fd < 0) - bpf_dump_error("BPF map rejected: %s\n", strerror(errno)); - - return map_fd; -} - -static void bpf_maps_init(void) -{ - int i; - - memset(map_ent, 0, sizeof(map_ent)); - for (i = 0; i < ARRAY_SIZE(map_fds); i++) - map_fds[i] = -1; -} - -static int bpf_maps_count(void) -{ - int i, count = 0; - - for 
(i = 0; i < ARRAY_SIZE(map_fds); i++) { - if (map_fds[i] < 0) - break; - count++; + ret = bind(cfd, (struct sockaddr *)&alg, sizeof(alg)); + if (ret < 0) { + fprintf(stderr, "Error binding socket: %s\n", strerror(errno)); + goto out_cfd; } - return count; -} - -static void bpf_maps_destroy(void) -{ - int i; - - memset(map_ent, 0, sizeof(map_ent)); - for (i = 0; i < ARRAY_SIZE(map_fds); i++) { - if (map_fds[i] >= 0) - close(map_fds[i]); - } -} - -static int bpf_maps_attach(struct bpf_elf_map *maps, unsigned int num_maps) -{ - int i, ret; - - for (i = 0; (i < num_maps) && (num_maps <= ARRAY_SIZE(map_fds)); i++) { - struct bpf_elf_map *map = &maps[i]; - - ret = bpf_map_attach(map->type, map->size_key, - map->size_value, map->max_elem); - if (ret < 0) - goto err_unwind; - - map_fds[i] = ret; + ofd = accept(cfd, NULL, 0); + if (ofd < 0) { + fprintf(stderr, "Error accepting socket: %s\n", + strerror(errno)); + ret = ofd; + goto out_cfd; } - return 0; + ffd = open(object, O_RDONLY); + if (ffd < 0) { + fprintf(stderr, "Error opening object %s: %s\n", + object, strerror(errno)); + ret = ffd; + goto out_ofd; + } -err_unwind: - bpf_maps_destroy(); + ret = fstat(ffd, &stbuff); + if (ret < 0) { + fprintf(stderr, "Error doing fstat: %s\n", + strerror(errno)); + goto out_ffd; + } + + size = sendfile(ofd, ffd, NULL, stbuff.st_size); + if (size != stbuff.st_size) { + fprintf(stderr, "Error from sendfile (%zd vs %zu bytes): %s\n", + size, stbuff.st_size, strerror(errno)); + ret = -1; + goto out_ffd; + } + + size = read(ofd, out, len); + if (size != len) { + fprintf(stderr, "Error from read (%zd vs %zu bytes): %s\n", + size, len, strerror(errno)); + ret = -1; + } else { + ret = 0; + } +out_ffd: + close(ffd); +out_ofd: + close(ofd); +out_cfd: + close(cfd); return ret; } -static int bpf_fill_section_data(Elf *elf_fd, GElf_Ehdr *elf_hdr, int sec_index, - struct bpf_elf_sec_data *sec_data) +static const char *bpf_get_obj_uid(const char *pathname) { + static bool bpf_uid_cached = false; + 
static char bpf_uid[64]; + uint8_t tmp[20]; + int ret; + + if (bpf_uid_cached) + goto done; + + ret = bpf_obj_hash(pathname, tmp, sizeof(tmp)); + if (ret) { + fprintf(stderr, "Object hashing failed!\n"); + return NULL; + } + + hexstring_n2a(tmp, sizeof(tmp), bpf_uid, sizeof(bpf_uid)); + bpf_uid_cached = true; +done: + return bpf_uid; +} + +static int bpf_mnt_fs(const char *target) +{ + bool bind_done = false; + + while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) { + if (errno != EINVAL || bind_done) { + fprintf(stderr, "mount --make-private %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + if (mount(target, target, "none", MS_BIND, NULL)) { + fprintf(stderr, "mount --bind %s %s failed: %s\n", + target, target, strerror(errno)); + return -1; + } + + bind_done = true; + } + + if (mount("bpf", target, "bpf", 0, NULL)) { + fprintf(stderr, "mount -t bpf bpf %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + return 0; +} + +static const char *bpf_get_tc_dir(void) +{ + static bool bpf_mnt_cached = false; + static char bpf_tc_dir[PATH_MAX]; + static const char *mnt; + static const char * const bpf_known_mnts[] = { + BPF_DIR_MNT, + 0, + }; + char bpf_mnt[PATH_MAX] = BPF_DIR_MNT; + char bpf_glo_dir[PATH_MAX]; + int ret; + + if (bpf_mnt_cached) + goto done; + + mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt), + bpf_known_mnts); + if (!mnt) { + mnt = getenv(BPF_ENV_MNT); + if (!mnt) + mnt = BPF_DIR_MNT; + ret = bpf_mnt_fs(mnt); + if (ret) { + mnt = NULL; + goto out; + } + } + + snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC); + ret = mkdir(bpf_tc_dir, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s", + bpf_tc_dir, BPF_DIR_GLOBALS); + ret = mkdir(bpf_glo_dir, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", 
bpf_glo_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + mnt = bpf_tc_dir; +out: + bpf_mnt_cached = true; +done: + return mnt; +} + +static int bpf_init_env(const char *pathname) +{ + struct rlimit limit = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + /* Don't bother in case we fail! */ + setrlimit(RLIMIT_MEMLOCK, &limit); + + if (!bpf_get_tc_dir()) { + fprintf(stderr, "Continuing without mounted eBPF fs. " + "Too old kernel?\n"); + return 0; + } + + if (!bpf_get_obj_uid(pathname)) + return -1; + + return 0; +} + +static bool bpf_no_pinning(int pinning) +{ + switch (pinning) { + case PIN_OBJECT_NS: + case PIN_GLOBAL_NS: + return false; + case PIN_NONE: + default: + return true; + } +} + +static void bpf_make_pathname(char *pathname, size_t len, const char *name, + int pinning) +{ + switch (pinning) { + case PIN_OBJECT_NS: + snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(), + bpf_get_obj_uid(NULL), name); + break; + case PIN_GLOBAL_NS: + snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(), + BPF_DIR_GLOBALS, name); + break; + } +} + +static int bpf_probe_pinned(const char *name, int pinning) +{ + char pathname[PATH_MAX]; + + if (bpf_no_pinning(pinning) || !bpf_get_tc_dir()) + return 0; + + bpf_make_pathname(pathname, sizeof(pathname), name, pinning); + return bpf_obj_get(pathname); +} + +static int bpf_place_pinned(int fd, const char *name, int pinning) +{ + char pathname[PATH_MAX]; + int ret; + + if (bpf_no_pinning(pinning) || !bpf_get_tc_dir()) + return 0; + + if (pinning == PIN_OBJECT_NS) { + snprintf(pathname, sizeof(pathname), "%s/%s", + bpf_get_tc_dir(), bpf_get_obj_uid(NULL)); + + ret = mkdir(pathname, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", pathname, + strerror(errno)); + return ret; + } + } + + bpf_make_pathname(pathname, sizeof(pathname), name, pinning); + return bpf_obj_pin(fd, pathname); +} + +static int bpf_prog_attach(const char *section, + const struct bpf_elf_prog 
*prog, bool verbose) +{ + int fd; + + /* We can add pinning here later as well, same as bpf_map_attach(). */ + errno = 0; + fd = bpf_prog_load(prog->type, prog->insns, prog->size, + prog->license); + if (fd < 0 || verbose) { + bpf_dump_error("Prog section \'%s\' (type:%u insns:%zu " + "license:\'%s\') %s%s (%d)!\n\n", + section, prog->type, + prog->size / sizeof(struct bpf_insn), + prog->license, fd < 0 ? "rejected :" : + "loaded", fd < 0 ? strerror(errno) : "", + fd < 0 ? errno : fd); + } + + return fd; +} + +static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, + bool verbose) +{ + int fd, ret; + + fd = bpf_probe_pinned(name, map->pinning); + if (fd > 0) { + if (verbose) + fprintf(stderr, "Map \'%s\' loaded as pinned!\n", + name); + return fd; + } + + errno = 0; + fd = bpf_map_create(map->type, map->size_key, map->size_value, + map->max_elem); + if (fd < 0 || verbose) { + bpf_dump_error("Map \'%s\' (type:%u id:%u pinning:%u " + "ksize:%u vsize:%u max-elems:%u) %s%s (%d)!\n", + name, map->type, map->id, map->pinning, + map->size_key, map->size_value, map->max_elem, + fd < 0 ? "rejected: " : "loaded", fd < 0 ? + strerror(errno) : "", fd < 0 ? 
errno : fd); + if (fd < 0) + return fd; + } + + ret = bpf_place_pinned(fd, name, map->pinning); + if (ret < 0 && errno != EEXIST) { + fprintf(stderr, "Could not pin %s map: %s\n", name, + strerror(errno)); + close(fd); + return ret; + } + + return fd; +} + +#define __ELF_ST_BIND(x) ((x) >> 4) +#define __ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) + +static const char *bpf_str_tab_name(const struct bpf_elf_ctx *ctx, + const GElf_Sym *sym) +{ + return ctx->str_tab->d_buf + sym->st_name; +} + +static const char *bpf_map_fetch_name(struct bpf_elf_ctx *ctx, int which) +{ + GElf_Sym sym; + int i; + + for (i = 0; i < ctx->sym_num; i++) { + if (gelf_getsym(ctx->sym_tab, i, &sym) != &sym) + continue; + + if (__ELF_ST_BIND(sym.st_info) != STB_GLOBAL || + __ELF_ST_TYPE(sym.st_info) != STT_NOTYPE || + sym.st_shndx != ctx->sec_maps || + sym.st_value / sizeof(struct bpf_elf_map) != which) + continue; + + return bpf_str_tab_name(ctx, &sym); + } + + return NULL; +} + +static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx) +{ + const char *map_name; + int i, fd; + + for (i = 0; i < ctx->map_num; i++) { + map_name = bpf_map_fetch_name(ctx, i); + if (!map_name) + return -EIO; + + fd = bpf_map_attach(map_name, &ctx->maps[i], ctx->verbose); + if (fd < 0) + return fd; + + ctx->map_fds[i] = fd; + } + + return 0; +} + +static int bpf_fill_section_data(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + Elf_Data *sec_edata; GElf_Shdr sec_hdr; Elf_Scn *sec_fd; - Elf_Data *sec_edata; char *sec_name; - memset(sec_data, 0, sizeof(*sec_data)); + memset(data, 0, sizeof(*data)); - sec_fd = elf_getscn(elf_fd, sec_index); + sec_fd = elf_getscn(ctx->elf_fd, section); if (!sec_fd) return -EINVAL; - if (gelf_getshdr(sec_fd, &sec_hdr) != &sec_hdr) return -EIO; - sec_name = elf_strptr(elf_fd, elf_hdr->e_shstrndx, + sec_name = elf_strptr(ctx->elf_fd, ctx->elf_hdr.e_shstrndx, sec_hdr.sh_name); if (!sec_name || !sec_hdr.sh_size) return -ENOENT; @@ -404,16 +922,131 @@ static int 
bpf_fill_section_data(Elf *elf_fd, GElf_Ehdr *elf_hdr, int sec_index, if (!sec_edata || elf_getdata(sec_fd, sec_edata)) return -EIO; - memcpy(&sec_data->sec_hdr, &sec_hdr, sizeof(sec_hdr)); - sec_data->sec_name = sec_name; - sec_data->sec_data = sec_edata; + memcpy(&data->sec_hdr, &sec_hdr, sizeof(sec_hdr)); + data->sec_name = sec_name; + data->sec_data = sec_edata; return 0; } -static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, - struct bpf_elf_sec_data *data_insn, - Elf_Data *sym_tab) +static int bpf_fetch_maps(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + if (data->sec_data->d_size % sizeof(struct bpf_elf_map) != 0) + return -EINVAL; + + ctx->map_num = data->sec_data->d_size / sizeof(struct bpf_elf_map); + ctx->sec_maps = section; + ctx->sec_done[section] = true; + + if (ctx->map_num > ARRAY_SIZE(ctx->map_fds)) { + fprintf(stderr, "Too many BPF maps in ELF section!\n"); + return -ENOMEM; + } + + memcpy(ctx->maps, data->sec_data->d_buf, data->sec_data->d_size); + return 0; +} + +static int bpf_fetch_license(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + if (data->sec_data->d_size > sizeof(ctx->license)) + return -ENOMEM; + + memcpy(ctx->license, data->sec_data->d_buf, data->sec_data->d_size); + ctx->sec_done[section] = true; + return 0; +} + +static int bpf_fetch_symtab(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + ctx->sym_tab = data->sec_data; + ctx->sym_num = data->sec_hdr.sh_size / data->sec_hdr.sh_entsize; + ctx->sec_done[section] = true; + return 0; +} + +static int bpf_fetch_strtab(struct bpf_elf_ctx *ctx, int section, + struct bpf_elf_sec_data *data) +{ + ctx->str_tab = data->sec_data; + ctx->sec_done[section] = true; + return 0; +} + +static int bpf_fetch_ancillary(struct bpf_elf_ctx *ctx) +{ + struct bpf_elf_sec_data data; + int i, ret = -1; + + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + ret = bpf_fill_section_data(ctx, i, &data); + if (ret < 0) 
+ continue; + + if (!strcmp(data.sec_name, ELF_SECTION_MAPS)) + ret = bpf_fetch_maps(ctx, i, &data); + else if (!strcmp(data.sec_name, ELF_SECTION_LICENSE)) + ret = bpf_fetch_license(ctx, i, &data); + else if (data.sec_hdr.sh_type == SHT_SYMTAB) + ret = bpf_fetch_symtab(ctx, i, &data); + else if (data.sec_hdr.sh_type == SHT_STRTAB && + i != ctx->elf_hdr.e_shstrndx) + ret = bpf_fetch_strtab(ctx, i, &data); + if (ret < 0) { + fprintf(stderr, "Error parsing section %d! Perhaps" + "check with readelf -a?\n", i); + break; + } + } + + if (ctx->sym_tab && ctx->str_tab && ctx->sec_maps) { + ret = bpf_maps_attach_all(ctx); + if (ret < 0) { + fprintf(stderr, "Error loading maps into kernel!\n"); + return ret; + } + } + + return ret; +} + +static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section) +{ + struct bpf_elf_sec_data data; + struct bpf_elf_prog prog; + int ret, i, fd = -1; + + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + if (ctx->sec_done[i]) + continue; + + ret = bpf_fill_section_data(ctx, i, &data); + if (ret < 0 || strcmp(data.sec_name, section)) + continue; + + memset(&prog, 0, sizeof(prog)); + prog.type = ctx->type; + prog.insns = data.sec_data->d_buf; + prog.size = data.sec_data->d_size; + prog.license = ctx->license; + + fd = bpf_prog_attach(section, &prog, ctx->verbose); + if (fd < 0) + continue; + + ctx->sec_done[i] = true; + break; + } + + return fd; +} + +static int bpf_apply_relo_data(struct bpf_elf_ctx *ctx, + struct bpf_elf_sec_data *data_relo, + struct bpf_elf_sec_data *data_insn) { Elf_Data *idata = data_insn->sec_data; GElf_Shdr *rhdr = &data_relo->sec_hdr; @@ -422,7 +1055,7 @@ static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, unsigned int num_insns = idata->d_size / sizeof(*insns); for (relo_ent = 0; relo_ent < relo_num; relo_ent++) { - unsigned int ioff, fnum; + unsigned int ioff, rmap; GElf_Rel relo; GElf_Sym sym; @@ -430,291 +1063,254 @@ static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo, return 
-EIO; ioff = relo.r_offset / sizeof(struct bpf_insn); - if (ioff >= num_insns) - return -EINVAL; - if (insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (ioff >= num_insns || + insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW)) return -EINVAL; - if (gelf_getsym(sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym) + if (gelf_getsym(ctx->sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym) return -EIO; - fnum = sym.st_value / sizeof(struct bpf_elf_map); - if (fnum >= ARRAY_SIZE(map_fds)) + rmap = sym.st_value / sizeof(struct bpf_elf_map); + if (rmap >= ARRAY_SIZE(ctx->map_fds)) return -EINVAL; - if (map_fds[fnum] < 0) + if (!ctx->map_fds[rmap]) return -EINVAL; + if (ctx->verbose) + fprintf(stderr, "Map \'%s\' (%d) injected into prog " + "section \'%s\' at offset %u!\n", + bpf_str_tab_name(ctx, &sym), ctx->map_fds[rmap], + data_insn->sec_name, ioff); + insns[ioff].src_reg = BPF_PSEUDO_MAP_FD; - insns[ioff].imm = map_fds[fnum]; + insns[ioff].imm = ctx->map_fds[rmap]; } return 0; } -static int bpf_fetch_ancillary(int file_fd, Elf *elf_fd, GElf_Ehdr *elf_hdr, - bool *sec_done, char *license, unsigned int lic_len, - Elf_Data **sym_tab) +static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section) { - int sec_index, ret = -1; + struct bpf_elf_sec_data data_relo, data_insn; + struct bpf_elf_prog prog; + int ret, idx, i, fd = -1; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_anc; - - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_anc); - if (ret < 0) - continue; - - /* Extract and load eBPF map fds. 
*/ - if (!strcmp(data_anc.sec_name, ELF_SECTION_MAPS) && - !bpf_may_skip_map_creation(file_fd)) { - struct bpf_elf_map *maps; - unsigned int maps_num; - - if (data_anc.sec_data->d_size % sizeof(*maps) != 0) - return -EINVAL; - - maps = data_anc.sec_data->d_buf; - maps_num = data_anc.sec_data->d_size / sizeof(*maps); - memcpy(map_ent, maps, data_anc.sec_data->d_size); - - ret = bpf_maps_attach(maps, maps_num); - if (ret < 0) - return ret; - - sec_done[sec_index] = true; - } - /* Extract eBPF license. */ - else if (!strcmp(data_anc.sec_name, ELF_SECTION_LICENSE)) { - if (data_anc.sec_data->d_size > lic_len) - return -ENOMEM; - - sec_done[sec_index] = true; - memcpy(license, data_anc.sec_data->d_buf, - data_anc.sec_data->d_size); - } - /* Extract symbol table for relocations (map fd fixups). */ - else if (data_anc.sec_hdr.sh_type == SHT_SYMTAB) { - sec_done[sec_index] = true; - *sym_tab = data_anc.sec_data; - } - } - - return ret; -} - -static int bpf_fetch_prog_relo(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license, Elf_Data *sym_tab) -{ - int sec_index, prog_fd = -1; - - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_relo, data_insn; - int ins_index, ret; - - /* Attach eBPF programs with relocation data (maps). 
*/ - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_relo); + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + ret = bpf_fill_section_data(ctx, i, &data_relo); if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL) continue; - ins_index = data_relo.sec_hdr.sh_info; - - ret = bpf_fill_section_data(elf_fd, elf_hdr, ins_index, - &data_insn); - if (ret < 0) - continue; - if (strcmp(data_insn.sec_name, sec)) + idx = data_relo.sec_hdr.sh_info; + ret = bpf_fill_section_data(ctx, idx, &data_insn); + if (ret < 0 || strcmp(data_insn.sec_name, section)) continue; - ret = bpf_apply_relo_data(&data_relo, &data_insn, sym_tab); + ret = bpf_apply_relo_data(ctx, &data_relo, &data_insn); if (ret < 0) continue; - prog_fd = bpf_prog_attach(type, sec, data_insn.sec_data->d_buf, - data_insn.sec_data->d_size, license); - if (prog_fd < 0) + memset(&prog, 0, sizeof(prog)); + prog.type = ctx->type; + prog.insns = data_insn.sec_data->d_buf; + prog.size = data_insn.sec_data->d_size; + prog.license = ctx->license; + + fd = bpf_prog_attach(section, &prog, ctx->verbose); + if (fd < 0) continue; - sec_done[sec_index] = true; - sec_done[ins_index] = true; + ctx->sec_done[i] = true; + ctx->sec_done[idx] = true; break; } - return prog_fd; + return fd; } -static int bpf_fetch_prog(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license) -{ - int sec_index, prog_fd = -1; - - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_insn; - int ret; - - /* Attach eBPF programs without relocation data. 
*/ - if (sec_done[sec_index]) - continue; - - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_insn); - if (ret < 0) - continue; - if (strcmp(data_insn.sec_name, sec)) - continue; - - prog_fd = bpf_prog_attach(type, sec, data_insn.sec_data->d_buf, - data_insn.sec_data->d_size, license); - if (prog_fd < 0) - continue; - - sec_done[sec_index] = true; - break; - } - - return prog_fd; -} - -static int bpf_fetch_prog_sec(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *sec, - const char *license, Elf_Data *sym_tab) +static int bpf_fetch_prog_sec(struct bpf_elf_ctx *ctx, const char *section) { int ret = -1; - if (sym_tab) - ret = bpf_fetch_prog_relo(elf_fd, elf_hdr, sec_done, type, - sec, license, sym_tab); + if (ctx->sym_tab) + ret = bpf_fetch_prog_relo(ctx, section); if (ret < 0) - ret = bpf_fetch_prog(elf_fd, elf_hdr, sec_done, type, sec, - license); + ret = bpf_fetch_prog(ctx, section); + return ret; } -static int bpf_fill_prog_arrays(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_done, - enum bpf_prog_type type, const char *license, - Elf_Data *sym_tab) +static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx) { - int sec_index; + struct bpf_elf_sec_data data; + uint32_t map_id, key_id; + int fd, i, ret; - for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) { - struct bpf_elf_sec_data data_insn; - int ret, map_id, key_id, prog_fd; - - if (sec_done[sec_index]) + for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { + if (ctx->sec_done[i]) continue; - ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index, - &data_insn); + ret = bpf_fill_section_data(ctx, i, &data); if (ret < 0) continue; - ret = sscanf(data_insn.sec_name, "%i/%i", &map_id, &key_id); - if (ret != 2) + ret = sscanf(data.sec_name, "%u/%u", &map_id, &key_id); + if (ret != 2 || map_id >= ARRAY_SIZE(ctx->map_fds) || + !ctx->map_fds[map_id]) + continue; + if (ctx->maps[map_id].type != BPF_MAP_TYPE_PROG_ARRAY || + ctx->maps[map_id].max_elem <= key_id) 
continue; - if (map_id >= ARRAY_SIZE(map_fds) || map_fds[map_id] < 0) - return -ENOENT; - if (map_ent[map_id].type != BPF_MAP_TYPE_PROG_ARRAY || - map_ent[map_id].max_elem <= key_id) - return -EINVAL; - - prog_fd = bpf_fetch_prog_sec(elf_fd, elf_hdr, sec_done, - type, data_insn.sec_name, - license, sym_tab); - if (prog_fd < 0) + fd = bpf_fetch_prog_sec(ctx, data.sec_name); + if (fd < 0) return -EIO; - ret = bpf_update_map(map_fds[map_id], &key_id, &prog_fd, - BPF_ANY); + ret = bpf_map_update(ctx->map_fds[map_id], &key_id, + &fd, BPF_NOEXIST); if (ret < 0) return -ENOENT; - sec_done[sec_index] = true; + ctx->sec_done[i] = true; } return 0; } -int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose) +static void bpf_save_finfo(struct bpf_elf_ctx *ctx) { - char license[ELF_MAX_LICENSE_LEN]; - int file_fd, prog_fd = -1, ret; - Elf_Data *sym_tab = NULL; - GElf_Ehdr elf_hdr; - bool *sec_done; - Elf *elf_fd; + struct stat st; + int ret; - if (elf_version(EV_CURRENT) == EV_NONE) - return -EINVAL; + memset(&ctx->stat, 0, sizeof(ctx->stat)); - file_fd = open(path, O_RDONLY, 0); - if (file_fd < 0) - return -errno; - - elf_fd = elf_begin(file_fd, ELF_C_READ, NULL); - if (!elf_fd) { - ret = -EINVAL; - goto out; + ret = fstat(ctx->obj_fd, &st); + if (ret < 0) { + fprintf(stderr, "Stat of elf file failed: %s\n", + strerror(errno)); + return; } - if (gelf_getehdr(elf_fd, &elf_hdr) != &elf_hdr) { + ctx->stat.st_dev = st.st_dev; + ctx->stat.st_ino = st.st_ino; +} + +static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, + enum bpf_prog_type type, bool verbose) +{ + int ret = -EINVAL; + + if (elf_version(EV_CURRENT) == EV_NONE || + bpf_init_env(pathname)) + return ret; + + memset(ctx, 0, sizeof(*ctx)); + ctx->verbose = verbose; + ctx->type = type; + + ctx->obj_fd = open(pathname, O_RDONLY); + if (ctx->obj_fd < 0) + return ctx->obj_fd; + + ctx->elf_fd = elf_begin(ctx->obj_fd, ELF_C_READ, NULL); + if (!ctx->elf_fd) { + ret = 
-EINVAL; + goto out_fd; + } + + if (gelf_getehdr(ctx->elf_fd, &ctx->elf_hdr) != + &ctx->elf_hdr) { ret = -EIO; goto out_elf; } - sec_done = calloc(elf_hdr.e_shnum, sizeof(*sec_done)); - if (!sec_done) { + ctx->sec_done = calloc(ctx->elf_hdr.e_shnum, + sizeof(*(ctx->sec_done))); + if (!ctx->sec_done) { ret = -ENOMEM; goto out_elf; } - memset(license, 0, sizeof(license)); - bpf_verbose = verbose; + bpf_save_finfo(ctx); + return 0; +out_elf: + elf_end(ctx->elf_fd); +out_fd: + close(ctx->obj_fd); + return ret; +} - if (!bpf_may_skip_map_creation(file_fd)) - bpf_maps_init(); +static int bpf_maps_count(struct bpf_elf_ctx *ctx) +{ + int i, count = 0; - ret = bpf_fetch_ancillary(file_fd, elf_fd, &elf_hdr, sec_done, - license, sizeof(license), &sym_tab); - if (ret < 0) - goto out_maps; - - prog_fd = bpf_fetch_prog_sec(elf_fd, &elf_hdr, sec_done, type, - sec, license, sym_tab); - if (prog_fd < 0) - goto out_maps; - - if (!bpf_may_skip_map_creation(file_fd)) { - ret = bpf_fill_prog_arrays(elf_fd, &elf_hdr, sec_done, - type, license, sym_tab); - if (ret < 0) - goto out_prog; + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) { + if (!ctx->map_fds[i]) + break; + count++; } - bpf_save_finfo(file_fd); + return count; +} - free(sec_done); +static void bpf_maps_teardown(struct bpf_elf_ctx *ctx) +{ + int i; - elf_end(elf_fd); - close(file_fd); + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) { + if (ctx->map_fds[i]) + close(ctx->map_fds[i]); + } +} - return prog_fd; +static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure) +{ + if (failure) + bpf_maps_teardown(ctx); -out_prog: - close(prog_fd); -out_maps: - bpf_maps_destroy(); - free(sec_done); -out_elf: - elf_end(elf_fd); + free(ctx->sec_done); + elf_end(ctx->elf_fd); + close(ctx->obj_fd); +} + +static struct bpf_elf_ctx __ctx; + +static int bpf_obj_open(const char *pathname, enum bpf_prog_type type, + const char *section, bool verbose) +{ + struct bpf_elf_ctx *ctx = &__ctx; + int fd = 0, ret; + + ret = 
bpf_elf_ctx_init(ctx, pathname, type, verbose); + if (ret < 0) { + fprintf(stderr, "Cannot initialize ELF context!\n"); + return ret; + } + + ret = bpf_fetch_ancillary(ctx); + if (ret < 0) { + fprintf(stderr, "Error fetching ELF ancillary data!\n"); + goto out; + } + + fd = bpf_fetch_prog_sec(ctx, section); + if (fd < 0) { + fprintf(stderr, "Error fetching program/map!\n"); + ret = fd; + goto out; + } + + ret = bpf_fill_prog_arrays(ctx); + if (ret < 0) + fprintf(stderr, "Error filling program arrays!\n"); out: - close(file_fd); - bpf_clear_finfo(); - return prog_fd; + bpf_elf_ctx_destroy(ctx, ret < 0); + if (ret < 0) { + if (fd) + close(fd); + return ret; + } + + return fd; } static int @@ -803,6 +1399,7 @@ bpf_map_set_recv(int fd, int *fds, struct bpf_map_aux *aux, int bpf_send_map_fds(const char *path, const char *obj) { + struct bpf_elf_ctx *ctx = &__ctx; struct sockaddr_un addr; struct bpf_map_data bpf_aux; int fd, ret; @@ -827,18 +1424,18 @@ int bpf_send_map_fds(const char *path, const char *obj) memset(&bpf_aux, 0, sizeof(bpf_aux)); - bpf_aux.fds = map_fds; - bpf_aux.ent = map_ent; - + bpf_aux.fds = ctx->map_fds; + bpf_aux.ent = ctx->maps; + bpf_aux.st = &ctx->stat; bpf_aux.obj = obj; - bpf_aux.st = &bpf_st; ret = bpf_map_set_send(fd, &addr, sizeof(addr), &bpf_aux, - bpf_maps_count()); + bpf_maps_count(ctx)); if (ret < 0) fprintf(stderr, "Cannot send fds to %s: %s\n", path, strerror(errno)); + bpf_maps_teardown(ctx); close(fd); return ret; } diff --git a/tc/tc_bpf.h b/tc/tc_bpf.h index 2ad88121..dea3c3bc 100644 --- a/tc/tc_bpf.h +++ b/tc/tc_bpf.h @@ -13,61 +13,56 @@ #ifndef _TC_BPF_H_ #define _TC_BPF_H_ 1 -#include #include -#include #include -#include -#include -#include -#include +#include #include "utils.h" #include "bpf_scm.h" +enum { + BPF_NLA_OPS_LEN = 0, + BPF_NLA_OPS, + BPF_NLA_FD, + BPF_NLA_NAME, + __BPF_NLA_MAX, +}; + +#define BPF_NLA_MAX __BPF_NLA_MAX + #define BPF_ENV_UDS "TC_BPF_UDS" +#define BPF_ENV_MNT "TC_BPF_MNT" +#define BPF_ENV_NOLOG 
"TC_BPF_NOLOG" -int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, - char **bpf_string, bool *need_release, - const char separator); -int bpf_parse_ops(int argc, char **argv, struct sock_filter *bpf_ops, - bool from_file); -void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len); +#ifndef BPF_FS_MAGIC +# define BPF_FS_MAGIC 0xcafe4a11 +#endif +#define BPF_DIR_MNT "/sys/fs/bpf" + +#define BPF_DIR_TC "tc" +#define BPF_DIR_GLOBALS "globals" + +#ifndef TRACEFS_MAGIC +# define TRACEFS_MAGIC 0x74726163 +#endif + +#define TRACE_DIR_MNT "/sys/kernel/tracing" + +int bpf_trace_pipe(void); const char *bpf_default_section(const enum bpf_prog_type type); -#ifdef HAVE_ELF -int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose); +int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, + enum bpf_prog_type type, const char **ptr_object, + const char **ptr_uds_name, struct nlmsghdr *n); +void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len); + +#ifdef HAVE_ELF int bpf_send_map_fds(const char *path, const char *obj); int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux, unsigned int entries); - -static inline __u64 bpf_ptr_to_u64(const void *ptr) -{ - return (__u64) (unsigned long) ptr; -} - -static inline int bpf(int cmd, union bpf_attr *attr, unsigned int size) -{ -#ifdef __NR_bpf - return syscall(__NR_bpf, cmd, attr, size); #else - fprintf(stderr, "No bpf syscall, kernel headers too old?\n"); - errno = ENOSYS; - return -1; -#endif -} -#else -static inline int bpf_open_object(const char *path, enum bpf_prog_type type, - const char *sec, bool verbose) -{ - fprintf(stderr, "No ELF library support compiled in.\n"); - errno = ENOSYS; - return -1; -} - static inline int bpf_send_map_fds(const char *path, const char *obj) { return 0; From 6581df5ef361bdce6b0b0128e83517a84217965d Mon Sep 17 00:00:00 2001 From: "John W. 
Linville" Date: Thu, 24 Sep 2015 14:39:39 -0400 Subject: [PATCH 078/151] geneve: add support for IPv6 link partners Signed-off-by: John W. Linville --- ip/iplink_geneve.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index 331240a6..13454795 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -55,7 +55,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, fprintf(stderr, "Invalid address \"%s\"\n", *argv); return -1; } - if (IN_MULTICAST(ntohl(daddr))) + if (IN6_IS_ADDR_MULTICAST(&daddr6) || IN_MULTICAST(ntohl(daddr))) invarg("invalid remote address", *argv); } else if (!matches(*argv, "ttl") || !matches(*argv, "hoplimit")) { @@ -96,18 +96,16 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, return -1; } - if (!daddr) { - fprintf(stderr, "geneve: remove link partner not specified\n"); - return -1; - } - if (memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) != 0) { - fprintf(stderr, "geneve: remove link over IPv6 not supported\n"); + if (!daddr && memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) == 0) { + fprintf(stderr, "geneve: remote link partner not specified\n"); return -1; } addattr32(n, 1024, IFLA_GENEVE_ID, vni); if (daddr) addattr_l(n, 1024, IFLA_GENEVE_REMOTE, &daddr, 4); + if (memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) != 0) + addattr_l(n, 1024, IFLA_GENEVE_REMOTE6, &daddr6, sizeof(struct in6_addr)); addattr8(n, 1024, IFLA_GENEVE_TTL, ttl); addattr8(n, 1024, IFLA_GENEVE_TOS, tos); @@ -135,6 +133,14 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) if (addr) fprintf(f, "remote %s ", format_host(AF_INET, 4, &addr, s1, sizeof(s1))); + } else if (tb[IFLA_GENEVE_REMOTE6]) { + struct in6_addr addr; + memcpy(&addr, RTA_DATA(tb[IFLA_GENEVE_REMOTE6]), sizeof(struct in6_addr)); + if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0) { + if (IN6_IS_ADDR_MULTICAST(&addr)) + fprintf(f, "remote %s ", + 
format_host(AF_INET6, sizeof(struct in6_addr), &addr, s1, sizeof(s1))); + } } if (tb[IFLA_GENEVE_TTL]) { From 906ac5437ab8e1e5f8514afe62bf01e1ff3906af Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Thu, 24 Sep 2015 14:39:39 -0400 Subject: [PATCH 079/151] geneve: add support for IPv6 link partners Signed-off-by: John W. Linville --- ip/iplink_geneve.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index 331240a6..13454795 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -55,7 +55,7 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, fprintf(stderr, "Invalid address \"%s\"\n", *argv); return -1; } - if (IN_MULTICAST(ntohl(daddr))) + if (IN6_IS_ADDR_MULTICAST(&daddr6) || IN_MULTICAST(ntohl(daddr))) invarg("invalid remote address", *argv); } else if (!matches(*argv, "ttl") || !matches(*argv, "hoplimit")) { @@ -96,18 +96,16 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, return -1; } - if (!daddr) { - fprintf(stderr, "geneve: remove link partner not specified\n"); - return -1; - } - if (memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) != 0) { - fprintf(stderr, "geneve: remove link over IPv6 not supported\n"); + if (!daddr && memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) == 0) { + fprintf(stderr, "geneve: remote link partner not specified\n"); return -1; } addattr32(n, 1024, IFLA_GENEVE_ID, vni); if (daddr) addattr_l(n, 1024, IFLA_GENEVE_REMOTE, &daddr, 4); + if (memcmp(&daddr6, &in6addr_any, sizeof(daddr6)) != 0) + addattr_l(n, 1024, IFLA_GENEVE_REMOTE6, &daddr6, sizeof(struct in6_addr)); addattr8(n, 1024, IFLA_GENEVE_TTL, ttl); addattr8(n, 1024, IFLA_GENEVE_TOS, tos); @@ -135,6 +133,14 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) if (addr) fprintf(f, "remote %s ", format_host(AF_INET, 4, &addr, s1, sizeof(s1))); + } else if (tb[IFLA_GENEVE_REMOTE6]) { + struct in6_addr addr; + 
memcpy(&addr, RTA_DATA(tb[IFLA_GENEVE_REMOTE6]), sizeof(struct in6_addr)); + if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0) { + if (IN6_IS_ADDR_MULTICAST(&addr)) + fprintf(f, "remote %s ", + format_host(AF_INET6, sizeof(struct in6_addr), &addr, s1, sizeof(s1))); + } } if (tb[IFLA_GENEVE_TTL]) { From 13ada95da4f05df5993ca332358fa3f2f3a21047 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 24 Nov 2015 13:20:01 -0800 Subject: [PATCH 080/151] Add support for rt_tables.d Add support for reading table id/name mappings from rt_tables.d directory. Suggested-by: Roopa Prabhu Signed-off-by: David Ahern --- etc/iproute2/rt_tables.d/README | 3 +++ lib/rt_names.c | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 etc/iproute2/rt_tables.d/README diff --git a/etc/iproute2/rt_tables.d/README b/etc/iproute2/rt_tables.d/README new file mode 100644 index 00000000..79386f89 --- /dev/null +++ b/etc/iproute2/rt_tables.d/README @@ -0,0 +1,3 @@ +Each file in this directory is an rt_tables configuration file. iproute2 +commands scan this directory processing all files that end in '.conf'. 
+ diff --git a/lib/rt_names.c b/lib/rt_names.c index e87c65da..f68e91d6 100644 --- a/lib/rt_names.c +++ b/lib/rt_names.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -339,6 +340,8 @@ static int rtnl_rttable_init; static void rtnl_rttable_initialize(void) { + struct dirent *de; + DIR *d; int i; rtnl_rttable_init = 1; @@ -348,6 +351,29 @@ static void rtnl_rttable_initialize(void) } rtnl_hash_initialize(CONFDIR "/rt_tables", rtnl_rttable_hash, 256); + + d = opendir(CONFDIR "/rt_tables.d"); + if (!d) + return; + + while ((de = readdir(d)) != NULL) { + char path[PATH_MAX]; + size_t len; + + if (*de->d_name == '.') + continue; + + /* only consider filenames ending in '.conf' */ + len = strlen(de->d_name); + if (len <= 5) + continue; + if (strcmp(de->d_name + len - 5, ".conf")) + continue; + + snprintf(path, sizeof(path), CONFDIR "/rt_tables.d/%s", de->d_name); + rtnl_hash_initialize(path, rtnl_rttable_hash, 256); + } + closedir(d); } const char * rtnl_rttable_n2a(__u32 id, char *buf, int len) From 68ef50724914e17ef47871e432618d73cce0c6c9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 Nov 2015 11:41:23 -0800 Subject: [PATCH 081/151] rt_names: style cleanup Cleanup all checkpatch complaints about whitespace in rt_names. 
--- lib/rt_names.c | 112 ++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/lib/rt_names.c b/lib/rt_names.c index f68e91d6..1071a938 100644 --- a/lib/rt_names.c +++ b/lib/rt_names.c @@ -31,8 +31,8 @@ #define NAME_MAX_LEN 512 struct rtnl_hash_entry { - struct rtnl_hash_entry *next; - const char * name; + struct rtnl_hash_entry *next; + const char *name; unsigned int id; }; @@ -82,7 +82,7 @@ rtnl_hash_initialize(const char *file, struct rtnl_hash_entry **hash, int size) return; } - if (id<0) + if (id < 0) continue; entry = malloc(sizeof(*entry)); @@ -112,7 +112,7 @@ static void rtnl_tab_initialize(const char *file, char **tab, int size) fclose(fp); return; } - if (id<0 || id>size) + if (id < 0 || id > size) continue; tab[id] = strdup(namebuf); @@ -120,23 +120,23 @@ static void rtnl_tab_initialize(const char *file, char **tab, int size) fclose(fp); } -static char * rtnl_rtprot_tab[256] = { - [RTPROT_UNSPEC] = "none", - [RTPROT_REDIRECT] ="redirect", - [RTPROT_KERNEL] = "kernel", - [RTPROT_BOOT] = "boot", - [RTPROT_STATIC] = "static", +static char *rtnl_rtprot_tab[256] = { + [RTPROT_UNSPEC] = "none", + [RTPROT_REDIRECT] = "redirect", + [RTPROT_KERNEL] = "kernel", + [RTPROT_BOOT] = "boot", + [RTPROT_STATIC] = "static", - [RTPROT_GATED] = "gated", - [RTPROT_RA] = "ra", - [RTPROT_MRT] = "mrt", - [RTPROT_ZEBRA] ="zebra", - [RTPROT_BIRD] = "bird", - [RTPROT_BABEL] = "babel", + [RTPROT_GATED] = "gated", + [RTPROT_RA] = "ra", + [RTPROT_MRT] = "mrt", + [RTPROT_ZEBRA] = "zebra", + [RTPROT_BIRD] = "bird", + [RTPROT_BABEL] = "babel", [RTPROT_DNROUTED] = "dnrouted", - [RTPROT_XORP] = "xorp", - [RTPROT_NTK] = "ntk", - [RTPROT_DHCP] = "dhcp", + [RTPROT_XORP] = "xorp", + [RTPROT_NTK] = "ntk", + [RTPROT_DHCP] = "dhcp", }; @@ -149,9 +149,9 @@ static void rtnl_rtprot_initialize(void) rtnl_rtprot_tab, 256); } -const char * rtnl_rtprot_n2a(int id, char *buf, int len) +const char *rtnl_rtprot_n2a(int id, char *buf, int len) { - 
if (id<0 || id>=256) { + if (id < 0 || id >= 256) { snprintf(buf, len, "%u", id); return buf; } @@ -167,7 +167,7 @@ const char * rtnl_rtprot_n2a(int id, char *buf, int len) int rtnl_rtprot_a2n(__u32 *id, const char *arg) { - static char *cache = NULL; + static char *cache; static unsigned long res; char *end; int i; @@ -180,7 +180,7 @@ int rtnl_rtprot_a2n(__u32 *id, const char *arg) if (!rtnl_rtprot_init) rtnl_rtprot_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { if (rtnl_rtprot_tab[i] && strcmp(rtnl_rtprot_tab[i], arg) == 0) { cache = rtnl_rtprot_tab[i]; @@ -197,8 +197,13 @@ int rtnl_rtprot_a2n(__u32 *id, const char *arg) return 0; } -static char * rtnl_rtscope_tab[256] = { - "global", + +static char *rtnl_rtscope_tab[256] = { + [RT_SCOPE_UNIVERSE] = "global", + [RT_SCOPE_NOWHERE] = "nowhere", + [RT_SCOPE_HOST] = "host", + [RT_SCOPE_LINK] = "link", + [RT_SCOPE_SITE] = "site", }; static int rtnl_rtscope_init; @@ -206,33 +211,32 @@ static int rtnl_rtscope_init; static void rtnl_rtscope_initialize(void) { rtnl_rtscope_init = 1; - rtnl_rtscope_tab[RT_SCOPE_NOWHERE] = "nowhere"; - rtnl_rtscope_tab[RT_SCOPE_HOST] = "host"; - rtnl_rtscope_tab[RT_SCOPE_LINK] = "link"; - rtnl_rtscope_tab[RT_SCOPE_SITE] = "site"; rtnl_tab_initialize(CONFDIR "/rt_scopes", rtnl_rtscope_tab, 256); } const char *rtnl_rtscope_n2a(int id, char *buf, int len) { - if (id<0 || id>=256) { + if (id < 0 || id >= 256) { snprintf(buf, len, "%d", id); return buf; } + if (!rtnl_rtscope_tab[id]) { if (!rtnl_rtscope_init) rtnl_rtscope_initialize(); } + if (rtnl_rtscope_tab[id]) return rtnl_rtscope_tab[id]; + snprintf(buf, len, "%d", id); return buf; } int rtnl_rtscope_a2n(__u32 *id, const char *arg) { - static const char *cache = NULL; + static const char *cache; static unsigned long res; char *end; int i; @@ -245,7 +249,7 @@ int rtnl_rtscope_a2n(__u32 *id, const char *arg) if (!rtnl_rtscope_init) rtnl_rtscope_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { if 
(rtnl_rtscope_tab[i] && strcmp(rtnl_rtscope_tab[i], arg) == 0) { cache = rtnl_rtscope_tab[i]; @@ -263,7 +267,7 @@ int rtnl_rtscope_a2n(__u32 *id, const char *arg) } -static char * rtnl_rtrealm_tab[256] = { +static char *rtnl_rtrealm_tab[256] = { "unknown", }; @@ -278,7 +282,7 @@ static void rtnl_rtrealm_initialize(void) const char *rtnl_rtrealm_n2a(int id, char *buf, int len) { - if (id<0 || id>=256) { + if (id < 0 || id >= 256) { snprintf(buf, len, "%d", id); return buf; } @@ -295,7 +299,7 @@ const char *rtnl_rtrealm_n2a(int id, char *buf, int len) int rtnl_rtrealm_a2n(__u32 *id, const char *arg) { - static char *cache = NULL; + static char *cache; static unsigned long res; char *end; int i; @@ -308,7 +312,7 @@ int rtnl_rtrealm_a2n(__u32 *id, const char *arg) if (!rtnl_rtrealm_init) rtnl_rtrealm_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { if (rtnl_rtrealm_tab[i] && strcmp(rtnl_rtrealm_tab[i], arg) == 0) { cache = rtnl_rtrealm_tab[i]; @@ -330,7 +334,7 @@ static struct rtnl_hash_entry dflt_table_entry = { .name = "default" }; static struct rtnl_hash_entry main_table_entry = { .name = "main" }; static struct rtnl_hash_entry local_table_entry = { .name = "local" }; -static struct rtnl_hash_entry * rtnl_rttable_hash[256] = { +static struct rtnl_hash_entry *rtnl_rttable_hash[256] = { [RT_TABLE_DEFAULT] = &dflt_table_entry, [RT_TABLE_MAIN] = &main_table_entry, [RT_TABLE_LOCAL] = &local_table_entry, @@ -370,13 +374,14 @@ static void rtnl_rttable_initialize(void) if (strcmp(de->d_name + len - 5, ".conf")) continue; - snprintf(path, sizeof(path), CONFDIR "/rt_tables.d/%s", de->d_name); + snprintf(path, sizeof(path), + CONFDIR "/rt_tables.d/%s", de->d_name); rtnl_hash_initialize(path, rtnl_rttable_hash, 256); } closedir(d); } -const char * rtnl_rttable_n2a(__u32 id, char *buf, int len) +const char *rtnl_rttable_n2a(__u32 id, char *buf, int len) { struct rtnl_hash_entry *entry; @@ -397,7 +402,7 @@ const char * rtnl_rttable_n2a(__u32 id, char *buf, 
int len) int rtnl_rttable_a2n(__u32 *id, const char *arg) { - static const char *cache = NULL; + static const char *cache; static unsigned long res; struct rtnl_hash_entry *entry; char *end; @@ -411,7 +416,7 @@ int rtnl_rttable_a2n(__u32 *id, const char *arg) if (!rtnl_rttable_init) rtnl_rttable_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { entry = rtnl_rttable_hash[i]; while (entry && strcmp(entry->name, arg)) entry = entry->next; @@ -431,7 +436,7 @@ int rtnl_rttable_a2n(__u32 *id, const char *arg) } -static char * rtnl_rtdsfield_tab[256] = { +static char *rtnl_rtdsfield_tab[256] = { "0", }; @@ -446,7 +451,7 @@ static void rtnl_rtdsfield_initialize(void) const char *rtnl_dsfield_n2a(int id, char *buf, int len) { - if (id<0 || id>=256) { + if (id < 0 || id >= 256) { snprintf(buf, len, "%d", id); return buf; } @@ -463,7 +468,7 @@ const char *rtnl_dsfield_n2a(int id, char *buf, int len) int rtnl_dsfield_a2n(__u32 *id, const char *arg) { - static char *cache = NULL; + static char *cache; static unsigned long res; char *end; int i; @@ -476,7 +481,7 @@ int rtnl_dsfield_a2n(__u32 *id, const char *arg) if (!rtnl_rtdsfield_init) rtnl_rtdsfield_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { if (rtnl_rtdsfield_tab[i] && strcmp(rtnl_rtdsfield_tab[i], arg) == 0) { cache = rtnl_rtdsfield_tab[i]; @@ -494,9 +499,11 @@ int rtnl_dsfield_a2n(__u32 *id, const char *arg) } -static struct rtnl_hash_entry dflt_group_entry = { .id = 0, .name = "default" }; +static struct rtnl_hash_entry dflt_group_entry = { + .id = 0, .name = "default" +}; -static struct rtnl_hash_entry * rtnl_group_hash[256] = { +static struct rtnl_hash_entry *rtnl_group_hash[256] = { [0] = &dflt_group_entry, }; @@ -511,7 +518,7 @@ static void rtnl_group_initialize(void) int rtnl_group_a2n(int *id, const char *arg) { - static const char *cache = NULL; + static const char *cache; static unsigned long res; struct rtnl_hash_entry *entry; char *end; @@ -525,7 +532,7 @@ int 
rtnl_group_a2n(int *id, const char *arg) if (!rtnl_group_init) rtnl_group_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { entry = rtnl_group_hash[i]; while (entry && strcmp(entry->name, arg)) entry = entry->next; @@ -552,11 +559,10 @@ const char *rtnl_group_n2a(int id, char *buf, int len) if (!rtnl_group_init) rtnl_group_initialize(); - for (i=0; i<256; i++) { + for (i = 0; i < 256; i++) { entry = rtnl_group_hash[i]; - if (entry && entry->id == id) { + if (entry && entry->id == id) return entry->name; - } } snprintf(buf, len, "%d", id); @@ -615,7 +621,7 @@ const char *nl_proto_n2a(int id, char *buf, int len) int nl_proto_a2n(__u32 *id, const char *arg) { - static char *cache = NULL; + static char *cache; static unsigned long res; char *end; int i; From c6995c48025233902a5b0c5fe88654e17ea934f6 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:31:00 +0100 Subject: [PATCH 082/151] ipaddress: simplify ipaddr_flush() Since it's no longer relevant whether an IP address is primary or secondary when flushing, ipaddr_flush() can be simplified a bit. 
Signed-off-by: Phil Sutter --- ip/ipaddress.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 05358c97..26e91c9b 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1148,28 +1148,6 @@ brief_exit: return 0; } -static int print_addrinfo_primary(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg) -{ - struct ifaddrmsg *ifa = NLMSG_DATA(n); - - if (ifa->ifa_flags & IFA_F_SECONDARY) - return 0; - - return print_addrinfo(who, n, arg); -} - -static int print_addrinfo_secondary(const struct sockaddr_nl *who, - struct nlmsghdr *n, void *arg) -{ - struct ifaddrmsg *ifa = NLMSG_DATA(n); - - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - return 0; - - return print_addrinfo(who, n, arg); -} - struct nlmsg_list { struct nlmsg_list *next; @@ -1420,26 +1398,12 @@ static int ipaddr_flush(void) filter.flushe = sizeof(flushb); while ((max_flush_loops == 0) || (round < max_flush_loops)) { - const struct rtnl_dump_filter_arg a[3] = { - { - .filter = print_addrinfo_secondary, - .arg1 = stdout, - }, - { - .filter = print_addrinfo_primary, - .arg1 = stdout, - }, - { - .filter = NULL, - .arg1 = NULL, - }, - }; if (rtnl_wilddump_request(&rth, filter.family, RTM_GETADDR) < 0) { perror("Cannot send dump request"); exit(1); } filter.flushed = 0; - if (rtnl_dump_filter_l(&rth, a) < 0) { + if (rtnl_dump_filter(&rth, print_addrinfo, stdout) < 0) { fprintf(stderr, "Flush terminated\n"); exit(1); } From 8e72880f6bfa39f439b9c4a88eb84b635b991687 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:31:01 +0100 Subject: [PATCH 083/151] libnetlink: introduce nc_flags Allow for a filter to ignore certain nlmsg_flags. 
Signed-off-by: Phil Sutter --- include/libnetlink.h | 7 ++++++- lib/libnetlink.c | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index 2280c39c..431189e2 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -60,11 +60,16 @@ struct rtnl_dump_filter_arg { rtnl_filter_t filter; void *arg1; + __u16 nc_flags; }; int rtnl_dump_filter_l(struct rtnl_handle *rth, const struct rtnl_dump_filter_arg *arg); -int rtnl_dump_filter(struct rtnl_handle *rth, rtnl_filter_t filter, void *arg); +int rtnl_dump_filter_nc(struct rtnl_handle *rth, + rtnl_filter_t filter, + void *arg, __u16 nc_flags); +#define rtnl_dump_filter(rth, filter, arg) \ + rtnl_dump_filter_nc(rth, filter, arg, 0) int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, struct nlmsghdr *answer, size_t len) __attribute__((warn_unused_result)); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 09b0e911..922ec2d9 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -259,6 +259,8 @@ int rtnl_dump_filter_l(struct rtnl_handle *rth, while (NLMSG_OK(h, msglen)) { int err = 0; + h->nlmsg_flags &= ~a->nc_flags; + if (nladdr.nl_pid != 0 || h->nlmsg_pid != rth->local.nl_pid || h->nlmsg_seq != rth->dump) @@ -317,13 +319,13 @@ skip_it: } } -int rtnl_dump_filter(struct rtnl_handle *rth, +int rtnl_dump_filter_nc(struct rtnl_handle *rth, rtnl_filter_t filter, - void *arg1) + void *arg1, __u16 nc_flags) { const struct rtnl_dump_filter_arg a[2] = { - { .filter = filter, .arg1 = arg1, }, - { .filter = NULL, .arg1 = NULL, }, + { .filter = filter, .arg1 = arg1, .nc_flags = nc_flags, }, + { .filter = NULL, .arg1 = NULL, .nc_flags = 0, }, }; return rtnl_dump_filter_l(rth, a); From d25ec03e1dce4cf22093a9f7106e9401ab5bf066 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:31:02 +0100 Subject: [PATCH 084/151] ipaddress: fix ipaddr_flush for Linux >= 3.1 Linux version 3.1 introduced a consistency check for netlink dumps in commit 
670dc28 ("netlink: advertise incomplete dumps"). This bites iproute2 when flushing more addresses than can fit into a single RTM_GETADDR response. To silence the spurious error message "Dump was interrupted and may be inconsistent.", advise rtnl_dump_filter_l() to not care about NLM_F_DUMP_INTR. Signed-off-by: Phil Sutter --- ip/ipaddress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 26e91c9b..9811eb4c 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1403,7 +1403,8 @@ static int ipaddr_flush(void) exit(1); } filter.flushed = 0; - if (rtnl_dump_filter(&rth, print_addrinfo, stdout) < 0) { + if (rtnl_dump_filter_nc(&rth, print_addrinfo, + stdout, NLM_F_DUMP_INTR) < 0) { fprintf(stderr, "Flush terminated\n"); exit(1); } From 906dfe4887672be87b0656a2034f950883f036f6 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:31:03 +0100 Subject: [PATCH 085/151] ipaddress: drop unnecessary check in ipaddr_list_flush_or_save() Right after ipaddr_reset_filter(), filter.family is always AF_UNSPEC. Signed-off-by: Phil Sutter --- ip/ipaddress.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 9811eb4c..bc8359eb 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1451,10 +1451,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) ipaddr_reset_filter(oneline, 0); filter.showqueue = 1; - - if (filter.family == AF_UNSPEC) - filter.family = preferred_family; - + filter.family = preferred_family; filter.group = -1; if (action == IPADD_FLUSH) { From d81f54d5999cb78f4d062a21693ddd50165df0ec Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:31:04 +0100 Subject: [PATCH 086/151] iptoken: simplify iptoken_list a bit Since it uses only a single filter, rtnl_dump_filter() can be used. 
Signed-off-by: Phil Sutter --- ip/iptoken.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ip/iptoken.c b/ip/iptoken.c index a38194c9..428f1332 100644 --- a/ip/iptoken.c +++ b/ip/iptoken.c @@ -95,10 +95,6 @@ static int iptoken_list(int argc, char **argv) { int af = AF_INET6; struct rtnl_dump_args da; - const struct rtnl_dump_filter_arg a[2] = { - { .filter = print_token, .arg1 = &da, }, - { .filter = NULL, .arg1 = NULL, }, - }; memset(&da, 0, sizeof(da)); da.fp = stdout; @@ -118,7 +114,7 @@ static int iptoken_list(int argc, char **argv) return -1; } - if (rtnl_dump_filter_l(&rth, a) < 0) { + if (rtnl_dump_filter(&rth, print_token, &da) < 0) { fprintf(stderr, "Dump terminated\n"); return -1; } From ea6cbab792f7bb8813f1b24cc1f4bd4caad8ccbe Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:45:31 +0100 Subject: [PATCH 087/151] iproute: restrict hoplimit values to be in range [0; 255] Technically, the range of possible hoplimit values is defined by IPv4 and IPv6 header formats. Both define the field to be eight bits in size, which leads to a value range of [0;255]. Setting a packet's hoplimit field to 0 though makes not much sense, as the next hop would immediately drop the packet. Therefore Linux uses 0 as a special value indicating to use the system's default hoplimit (configurable via sysctl). In iproute, setting the hoplimit of a route to 0 is equivalent to omitting the hoplimit parameter altogether, so it is actually not necessary to allow that value to be specified, but keep it anyway for backwards compatibility.
Signed-off-by: Phil Sutter --- ip/iproute.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index c0ef7bfe..aed1038e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -931,7 +931,7 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv) mxlock |= (1< 255) invarg("\"hoplimit\" value is invalid\n", *argv); rta_addattr32(mxrta, sizeof(mxbuf), RTAX_HOPLIMIT, hoplimit); } else if (strcmp(*argv, "advmss") == 0) { From fc31817d1f2ee8098b875a8b122f136a7564e339 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 24 Nov 2015 15:50:00 +0100 Subject: [PATCH 088/151] bridge.8: minor formatting cleanup - Replace commas at end of subsection with dots. - Replace double whitespace by single one. Signed-off-by: Phil Sutter --- man/man8/bridge.8 | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 98a92eb8..0ec6f174 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -25,14 +25,14 @@ bridge \- show / manipulate bridge addresses and devices .ti -8 .BR "bridge link set" -.B dev +.B dev .IR DEV .IR " [ " -.B cost +.B cost .IR COST " ] [ " -.B priority -.IR PRIO " ] [ " -.B state +.B priority +.IR PRIO " ] [ " +.B state .IR STATE "] [" .BR guard " { " on " | " off " } ] [ " .BR hairpin " { " on " | " off " } ] [ " @@ -42,21 +42,21 @@ bridge \- show / manipulate bridge addresses and devices .BR learning_sync " { " on " | " off " } ] [ " .BR flood " { " on " | " off " } ] [ " .BR hwmode " { " vepa " | " veb " } ] [ " -.BR self " ] [ " master " ] " +.BR self " ] [ " master " ] " .ti -8 .BR "bridge link" " [ " show " ] [ " -.B dev +.B dev .IR DEV " ]" .ti -8 .BR "bridge fdb" " { " add " | " append " | " del " | " replace " } " .I LLADDR -.B dev +.B dev .IR DEV " { " .BR local " | " temp " } [ " .BR self " ] [ " master " ] [ " router " ] [ " use " ] [ " -.B dst +.B dst .IR IPADDR " ] [ " .B vni .IR VNI " ] [" @@ 
-67,12 +67,12 @@ bridge \- show / manipulate bridge addresses and devices .ti -8 .BR "bridge fdb" " [ " show " ] [ " -.B dev +.B dev .IR DEV " ]" .ti -8 .BR "bridge mdb" " { " add " | " del " } " -.B dev +.B dev .IR DEV .B port .IR PORT @@ -84,21 +84,21 @@ bridge \- show / manipulate bridge addresses and devices .ti -8 .BR "bridge mdb show " [ " -.B dev +.B dev .IR DEV " ]" .ti -8 .BR "bridge vlan" " { " add " | " del " } " -.B dev +.B dev .IR DEV -.B vid +.B vid .IR VID " [ " -.BR pvid " ] [ " untagged " ] [ " -.BR self " ] [ " master " ] " +.BR pvid " ] [ " untagged " ] [ " +.BR self " ] [ " master " ] " .ti -8 .BR "bridge vlan" " [ " show " ] [ " -.B dev +.B dev .IR DEV " ]" .ti -8 @@ -319,7 +319,7 @@ This command displays the current bridge port configuration and flags. .SH bridge fdb - forwarding database management .B fdb -objects contain known Ethernet addresses on a link. +objects contain known Ethernet addresses on a link. .P The corresponding commands display fdb entries, add new entries, @@ -398,21 +398,21 @@ sends a copy of the data packet to each entry found. .PP The arguments are the same as with -.BR "bridge fdb add" , +.BR "bridge fdb add" . .SS bridge fdb delete - delete a forwarding database entry This command removes an existing fdb entry. .PP The arguments are the same as with -.BR "bridge fdb add" , +.BR "bridge fdb add" . .SS bridge fdb replace - replace a forwarding database entry If no matching entry is found, a new one will be created instead. .PP The arguments are the same as with -.BR "bridge fdb add" , +.BR "bridge fdb add" . .SS bridge fdb show - list forwarding entries. @@ -548,7 +548,7 @@ This command displays the current VLAN filter table. The .B bridge -utility can monitor the state of devices and addresses +utility can monitor the state of devices and addresses continuously. This option has a slightly different format. 
Namely, the .B monitor @@ -560,7 +560,7 @@ command is the first in the command line and then the object list follows: .I OBJECT-LIST is the list of object types that we want to monitor. It may contain -.BR link ", " fdb ", and " mdb "." +.BR link ", " fdb ", and " mdb "." If no .B file argument is given, From db3ef44c5433aa50bae6f88736ad350fb14fc2cf Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 28 Nov 2015 01:00:01 +0100 Subject: [PATCH 089/151] lnstat: review lnstat_update() Instead of calling rewind() and fgets() before every call to scan_lines(), move them into scan_lines() itself. This should also fix compat mode, as before the second call to scan_lines() the first line was skipped unconditionally. Signed-off-by: Phil Sutter --- misc/lnstat_util.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c index 6dde7c49..433e9929 100644 --- a/misc/lnstat_util.c +++ b/misc/lnstat_util.c @@ -38,18 +38,22 @@ /* Read (and summarize for SMP) the different stats vars. 
*/ static int scan_lines(struct lnstat_file *lf, int i) { + char buf[FGETS_BUF_SIZE]; int j, num_lines = 0; for (j = 0; j < lf->num_fields; j++) lf->fields[j].values[i] = 0; - while(!feof(lf->fp)) { - char buf[FGETS_BUF_SIZE]; + rewind(lf->fp); + /* skip first line */ + if (!lf->compat && !fgets(buf, sizeof(buf)-1, lf->fp)) + return -1; + + while(!feof(lf->fp) && fgets(buf, sizeof(buf)-1, lf->fp)) { char *ptr = buf; num_lines++; - fgets(buf, sizeof(buf)-1, lf->fp); gettimeofday(&lf->last_read, NULL); for (j = 0; j < lf->num_fields; j++) { @@ -81,7 +85,6 @@ static int time_after(struct timeval *last, int lnstat_update(struct lnstat_file *lnstat_files) { struct lnstat_file *lf; - char buf[FGETS_BUF_SIZE]; struct timeval tv; gettimeofday(&tv, NULL); @@ -91,11 +94,6 @@ int lnstat_update(struct lnstat_file *lnstat_files) int i; struct lnstat_field *lfi; - rewind(lf->fp); - if (!lf->compat) { - /* skip first line */ - fgets(buf, sizeof(buf)-1, lf->fp); - } scan_lines(lf, 1); for (i = 0, lfi = &lf->fields[i]; @@ -107,8 +105,6 @@ int lnstat_update(struct lnstat_file *lnstat_files) / lf->interval.tv_sec; } - rewind(lf->fp); - fgets(buf, sizeof(buf)-1, lf->fp); scan_lines(lf, 0); } } From 596307ea3d638ea037017b0d6f31f2d04fddd296 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 28 Nov 2015 01:00:02 +0100 Subject: [PATCH 090/151] ss: reduce max indentation level in init_service_resolver() Exit early or continue on error instead of putting conditional into conditional to make reading the code a bit easier. Also, the call to memcpy() can be skipped by initialising prog with the desired prefix. 
Signed-off-by: Phil Sutter --- misc/ss.c | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index a9ae85ec..4988d34e 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -870,31 +870,38 @@ static void init_service_resolver(void) { char buf[128]; FILE *fp = popen("/usr/sbin/rpcinfo -p 2>/dev/null", "r"); - if (fp) { - fgets(buf, sizeof(buf), fp); - while (fgets(buf, sizeof(buf), fp) != NULL) { - unsigned int progn, port; - char proto[128], prog[128]; - if (sscanf(buf, "%u %*d %s %u %s", &progn, proto, - &port, prog+4) == 4) { - struct scache *c = malloc(sizeof(*c)); - if (c) { - c->port = port; - memcpy(prog, "rpc.", 4); - c->name = strdup(prog); - if (strcmp(proto, TCP_PROTO) == 0) - c->proto = TCP_PROTO; - else if (strcmp(proto, UDP_PROTO) == 0) - c->proto = UDP_PROTO; - else - c->proto = NULL; - c->next = rlist; - rlist = c; - } - } - } + + if (!fp) + return; + + if (!fgets(buf, sizeof(buf), fp)) { pclose(fp); + return; } + while (fgets(buf, sizeof(buf), fp) != NULL) { + unsigned int progn, port; + char proto[128], prog[128] = "rpc."; + struct scache *c; + + if (sscanf(buf, "%u %*d %s %u %s", + &progn, proto, &port, prog+4) != 4) + continue; + + if (!(c = malloc(sizeof(*c)))) + continue; + + c->port = port; + c->name = strdup(prog); + if (strcmp(proto, TCP_PROTO) == 0) + c->proto = TCP_PROTO; + else if (strcmp(proto, UDP_PROTO) == 0) + c->proto = UDP_PROTO; + else + c->proto = NULL; + c->next = rlist; + rlist = c; + } + pclose(fp); } static int ip_local_port_min, ip_local_port_max; From c29d37925aa479a6c0545fee87329ab1d248680a Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 28 Nov 2015 01:00:03 +0100 Subject: [PATCH 091/151] ss: review is_ephemeral() No need to keep static port boundaries global, they are not used directly. Keeping them local also allows to safely reduce their names to the minimum. Assign hardcoded fallback values also if fscanf() fails. 
Get rid of unnecessary braces around return parameter. Instead of more or less duplicating is_ephemeral() in run_ssfilter(), simply call the function instead. Signed-off-by: Phil Sutter --- misc/ss.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index 4988d34e..79285732 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -904,8 +904,6 @@ static void init_service_resolver(void) pclose(fp); } -static int ip_local_port_min, ip_local_port_max; - /* Even do not try default linux ephemeral port ranges: * default /etc/services contains so much of useless crap * wouldbe "allocated" to this area that resolution @@ -914,19 +912,18 @@ static int ip_local_port_min, ip_local_port_max; */ static int is_ephemeral(int port) { - if (!ip_local_port_min) { - FILE *f = ephemeral_ports_open(); - if (f) { - fscanf(f, "%d %d", - &ip_local_port_min, &ip_local_port_max); - fclose(f); - } else { - ip_local_port_min = 1024; - ip_local_port_max = 4999; - } - } + static int min = 0, max = 0; - return (port >= ip_local_port_min && port<= ip_local_port_max); + if (!min) { + FILE *f = ephemeral_ports_open(); + if (!f || fscanf(f, "%d %d", &min, &max) < 2) { + min = 1024; + max = 4999; + } + if (f) + fclose(f); + } + return port >= min && port <= max; } @@ -1081,8 +1078,6 @@ static int run_ssfilter(struct ssfilter *f, struct sockstat *s) switch (f->type) { case SSF_S_AUTO: { - static int low, high=65535; - if (s->local.family == AF_UNIX) { char *p; memcpy(&p, s->local.data, sizeof(p)); @@ -1094,14 +1089,7 @@ static int run_ssfilter(struct ssfilter *f, struct sockstat *s) if (s->local.family == AF_NETLINK) return s->lport < 0; - if (!low) { - FILE *fp = ephemeral_ports_open(); - if (fp) { - fscanf(fp, "%d%d", &low, &high); - fclose(fp); - } - } - return s->lport >= low && s->lport <= high; + return is_ephemeral(s->lport); } case SSF_DCOND: { From d572ed4d0af79eb597469d3f1a84456782c64f24 Mon Sep 17 00:00:00 2001 From: Phil Sutter 
Date: Sat, 28 Nov 2015 01:00:04 +0100 Subject: [PATCH 092/151] get rid of remaining -Wunused-result warnings Although not fundamentally necessary to check return codes in these spots, preventing the warnings will put new ones into focus. Signed-off-by: Phil Sutter --- misc/ifstat.c | 6 ++++-- misc/lnstat_util.c | 3 ++- misc/nstat.c | 6 ++++-- misc/ss.c | 18 ++++++++++++++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/misc/ifstat.c b/misc/ifstat.c index 20a8db45..ac5c29c8 100644 --- a/misc/ifstat.c +++ b/misc/ifstat.c @@ -819,7 +819,8 @@ int main(int argc, char *argv[]) } if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) { fprintf(stderr, "ifstat: history is aged out, resetting\n"); - ftruncate(fileno(hist_fp), 0); + if (ftruncate(fileno(hist_fp), 0)) + perror("ifstat: ftruncate"); } } @@ -862,7 +863,8 @@ int main(int argc, char *argv[]) } if (!no_update) { - ftruncate(fileno(hist_fp), 0); + if (ftruncate(fileno(hist_fp), 0)) + perror("ifstat: ftruncate"); rewind(hist_fp); json_output = 0; diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c index 433e9929..70a77c56 100644 --- a/misc/lnstat_util.c +++ b/misc/lnstat_util.c @@ -138,7 +138,8 @@ static int lnstat_scan_fields(struct lnstat_file *lf) char buf[FGETS_BUF_SIZE]; rewind(lf->fp); - fgets(buf, sizeof(buf)-1, lf->fp); + if (!fgets(buf, sizeof(buf)-1, lf->fp)) + return -1; return __lnstat_scan_fields(lf, buf); } diff --git a/misc/nstat.c b/misc/nstat.c index 267e515f..99705286 100644 --- a/misc/nstat.c +++ b/misc/nstat.c @@ -649,7 +649,8 @@ int main(int argc, char *argv[]) } if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) { fprintf(stderr, "nstat: history is aged out, resetting\n"); - ftruncate(fileno(hist_fp), 0); + if (ftruncate(fileno(hist_fp), 0) < 0) + perror("nstat: ftruncate"); } } @@ -693,7 +694,8 @@ int main(int argc, char *argv[]) dump_incr_db(stdout); } if (!no_update) { - ftruncate(fileno(hist_fp), 0); + if (ftruncate(fileno(hist_fp), 0) < 0) + perror("nstat: 
ftruncate"); rewind(hist_fp); json_output = 0; diff --git a/misc/ss.c b/misc/ss.c index 79285732..d5090094 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -528,7 +528,8 @@ static void user_ent_hash_build(void) snprintf(tmp, sizeof(tmp), "%s/%d/stat", root, pid); if ((fp = fopen(tmp, "r")) != NULL) { - fscanf(fp, "%*d (%[^)])", p); + if (fscanf(fp, "%*d (%[^)])", p) < 1) + ; /* ignore */ fclose(fp); } } @@ -660,7 +661,10 @@ static int get_slabstat(struct slabstat *s) cnt = sizeof(*s)/sizeof(int); - fgets(buf, sizeof(buf), fp); + if (!fgets(buf, sizeof(buf), fp)) { + fclose(fp); + return -1; + } while(fgets(buf, sizeof(buf), fp) != NULL) { int i; for (i=0; i Date: Sat, 28 Nov 2015 01:00:05 +0100 Subject: [PATCH 093/151] get rid of unnecessary fgets() buffer size limitation fgets() will read at most size-1 bytes into the buffer and add a terminating null-char at the end. Therefore it is not necessary to pass a reduced buffer size when calling it. This change was generated using the following semantic patch: @@ identifier buf, fp; @@ - fgets(buf, sizeof(buf) - 1, fp) + fgets(buf, sizeof(buf), fp) Signed-off-by: Phil Sutter --- misc/arpd.c | 2 +- misc/ss.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/misc/arpd.c b/misc/arpd.c index 7919eb8b..6bb9bd16 100644 --- a/misc/arpd.c +++ b/misc/arpd.c @@ -703,7 +703,7 @@ int main(int argc, char **argv) } buf[sizeof(buf)-1] = 0; - while (fgets(buf, sizeof(buf)-1, fp)) { + while (fgets(buf, sizeof(buf), fp)) { __u8 b1[6]; char ipbuf[128]; char macbuf[128]; diff --git a/misc/ss.c b/misc/ss.c index d5090094..0dab32ce 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -2729,7 +2729,7 @@ static int unix_show(struct filter *f) if ((fp = net_unix_open()) == NULL) return -1; - if (!fgets(buf, sizeof(buf)-1, fp)) { + if (!fgets(buf, sizeof(buf), fp)) { fclose(fp); return -1; } @@ -2738,7 +2738,7 @@ static int unix_show(struct filter *f) newformat = 1; cnt = 0; - while (fgets(buf, sizeof(buf)-1, fp)) { + while (fgets(buf, 
sizeof(buf), fp)) { struct sockstat *u, **insp; int flags; @@ -3217,12 +3217,12 @@ static int netlink_show(struct filter *f) if ((fp = net_netlink_open()) == NULL) return -1; - if (!fgets(buf, sizeof(buf)-1, fp)) { + if (!fgets(buf, sizeof(buf), fp)) { fclose(fp); return -1; } - while (fgets(buf, sizeof(buf)-1, fp)) { + while (fgets(buf, sizeof(buf), fp)) { sscanf(buf, "%llx %d %d %x %d %d %llx %d", &sk, &prot, &pid, &groups, &rq, &wq, &cb, &rc); From 35f59d862fc9dec1e4af675c5ce776ba44be7eb7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 27 Nov 2015 10:23:43 -0800 Subject: [PATCH 094/151] vxlan: Add support for remote checksum offload This patch adds support for remote checksum offload to VXLAN. This patch adds remcsumtx and remcsumrx to ip vxlan configuration to enable remote checksum offload for transmit and receive on the VXLAN tunnel. https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Example: ip link add name vxlan0 type vxlan id 42 group 239.1.1.1 dev eth0 \ udpcsum remcsumtx remcsumrx Testing: Ran single netperf over mlnx4 to illustrate the effect: - Without RCO (UDP csum set to zero) 4335.99 Mbps - With RCO enabled 7661.81 Mbps Signed-off-by: Tom Herbert --- ip/iplink_vxlan.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index 473ff97a..db29bf03 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -30,6 +30,7 @@ static void print_explain(FILE *f) fprintf(f, " [ [no]l2miss ] [ [no]l3miss ]\n"); fprintf(f, " [ ageing SECONDS ] [ maxaddress NUMBER ]\n"); fprintf(f, " [ [no]udpcsum ] [ [no]udp6zerocsumtx ] [ [no]udp6zerocsumrx ]\n"); + fprintf(f, " [ [no]remcsumtx ] [ [no]remcsumrx ]\n"); fprintf(f, " [ gbp ]\n"); fprintf(f, "\n"); fprintf(f, "Where: VNI := 0-16777215\n"); @@ -69,6 +70,8 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, __u8 udpcsum = 0; __u8 udp6zerocsumtx = 0; __u8 udp6zerocsumrx = 0; + __u8 remcsumtx = 0; + __u8 remcsumrx = 0;
__u8 gbp = 0; int dst_port_set = 0; struct ifla_vxlan_port_range range = { 0, 0 }; @@ -199,6 +202,14 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, udp6zerocsumrx = 1; } else if (!matches(*argv, "noudp6zerocsumrx")) { udp6zerocsumrx = 0; + } else if (!matches(*argv, "remcsumtx")) { + remcsumtx = 1; + } else if (!matches(*argv, "noremcsumtx")) { + remcsumtx = 0; + } else if (!matches(*argv, "remcsumrx")) { + remcsumrx = 1; + } else if (!matches(*argv, "noremcsumrx")) { + remcsumrx = 0; } else if (!matches(*argv, "gbp")) { gbp = 1; } else if (matches(*argv, "help") == 0) { @@ -259,6 +270,8 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, addattr8(n, 1024, IFLA_VXLAN_UDP_CSUM, udpcsum); addattr8(n, 1024, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, udp6zerocsumtx); addattr8(n, 1024, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, udp6zerocsumrx); + addattr8(n, 1024, IFLA_VXLAN_REMCSUM_TX, remcsumtx); + addattr8(n, 1024, IFLA_VXLAN_REMCSUM_RX, remcsumrx); if (noage) addattr32(n, 1024, IFLA_VXLAN_AGEING, 0); @@ -407,6 +420,14 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) rta_getattr_u8(tb[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) fputs("udp6zerocsumrx ", f); + if (tb[IFLA_VXLAN_REMCSUM_TX] && + rta_getattr_u8(tb[IFLA_VXLAN_REMCSUM_TX])) + fputs("remcsumtx ", f); + + if (tb[IFLA_VXLAN_REMCSUM_RX] && + rta_getattr_u8(tb[IFLA_VXLAN_REMCSUM_RX])) + fputs("remcsumrx ", f); + if (tb[IFLA_VXLAN_GBP]) fputs("gbp ", f); } From 910b543dcce52290ce723758e1d9bb436188a26b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 26 Nov 2015 15:38:42 +0100 Subject: [PATCH 095/151] {f,m}_bpf: make tail calls working Now that we have the possibility of sharing maps, it's time we get the ELF loader fully working with regards to tail calls. Since program array maps are pinned, we can keep them finally alive. I've noticed two bugs that are being fixed in bpf_fill_prog_arrays() with this patch. Example code comes as follow-up. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tc/tc_bpf.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index bc7bc9ff..c3adc23c 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -1139,11 +1139,22 @@ static int bpf_fetch_prog_sec(struct bpf_elf_ctx *ctx, const char *section) return ret; } +static int bpf_find_map_by_id(struct bpf_elf_ctx *ctx, uint32_t id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->map_fds); i++) + if (ctx->map_fds[i] && ctx->maps[i].id == id && + ctx->maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + return i; + return -1; +} + static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx) { struct bpf_elf_sec_data data; uint32_t map_id, key_id; - int fd, i, ret; + int fd, i, ret, idx; for (i = 1; i < ctx->elf_hdr.e_shnum; i++) { if (ctx->sec_done[i]) @@ -1153,20 +1164,20 @@ static int bpf_fill_prog_arrays(struct bpf_elf_ctx *ctx) if (ret < 0) continue; - ret = sscanf(data.sec_name, "%u/%u", &map_id, &key_id); - if (ret != 2 || map_id >= ARRAY_SIZE(ctx->map_fds) || - !ctx->map_fds[map_id]) + ret = sscanf(data.sec_name, "%i/%i", &map_id, &key_id); + if (ret != 2) continue; - if (ctx->maps[map_id].type != BPF_MAP_TYPE_PROG_ARRAY || - ctx->maps[map_id].max_elem <= key_id) + + idx = bpf_find_map_by_id(ctx, map_id); + if (idx < 0) continue; fd = bpf_fetch_prog_sec(ctx, data.sec_name); if (fd < 0) return -EIO; - ret = bpf_map_update(ctx->map_fds[map_id], &key_id, - &fd, BPF_NOEXIST); + ret = bpf_map_update(ctx->map_fds[idx], &key_id, + &fd, BPF_ANY); if (ret < 0) return -ENOENT; From 9e607f2e722604a57a2c1ec9a174fcc505d9c451 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 26 Nov 2015 15:38:43 +0100 Subject: [PATCH 096/151] {f, m}_bpf: check map attributes when fetching as pinned Make use of the new show_fdinfo() facility and verify that when a pinned map is being fetched that its basic attributes are the same as the map we declared from the ELF file. I.e. 
when placed into the globalns, collisions could occur. In such a case warn the user and bail out. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tc/tc_bpf.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index c3adc23c..b44b1237 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -205,6 +205,52 @@ void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len) ops[i].jf, ops[i].k); } +static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map) +{ + char file[PATH_MAX], buff[4096]; + struct bpf_elf_map tmp, zero; + unsigned int val; + FILE *fp; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + + fp = fopen(file, "r"); + if (!fp) { + fprintf(stderr, "No procfs support?!\n"); + return -EIO; + } + + memset(&tmp, 0, sizeof(tmp)); + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + tmp.type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + tmp.size_key = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + tmp.size_value = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + tmp.max_elem = val; + } + + fclose(fp); + + if (!memcmp(&tmp, map, offsetof(struct bpf_elf_map, id))) { + return 0; + } else { + memset(&zero, 0, sizeof(zero)); + /* If kernel doesn't have eBPF-related fdinfo, we cannot do much, + * so just accept it. We know we do have an eBPF fd and in this + * case, everything is 0. It is guaranteed that no such map exists + * since map type of 0 is unloadable BPF_MAP_TYPE_UNSPEC. 
+ */ + if (!memcmp(&tmp, &zero, offsetof(struct bpf_elf_map, id))) + return 0; + + fprintf(stderr, "Map specs from pinned file differ!\n"); + return -EINVAL; + } +} + static int bpf_valid_mntpt(const char *mnt, unsigned long magic) { struct statfs st_fs; @@ -816,6 +862,13 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, fd = bpf_probe_pinned(name, map->pinning); if (fd > 0) { + ret = bpf_map_selfcheck_pinned(fd, map); + if (ret < 0) { + close(fd); + fprintf(stderr, "Map \'%s\' self-check failed!\n", + name); + return ret; + } if (verbose) fprintf(stderr, "Map \'%s\' loaded as pinned!\n", name); From f6793eec4600a9f9428026ed75c50a44eeb3c83f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 26 Nov 2015 15:38:44 +0100 Subject: [PATCH 097/151] {f, m}_bpf: allow for user-defined object pinnings The recently introduced object pinning can be further extended in order to allow sharing maps beyond tc namespace. F.e. maps that are being pinned from tracing side, can be accessed through this facility as well. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- etc/iproute2/bpf_pinning | 6 ++ include/bpf_elf.h | 2 +- include/utils.h | 4 + lib/rt_names.c | 5 +- tc/tc_bpf.c | 212 +++++++++++++++++++++++++++++++++++---- 5 files changed, 204 insertions(+), 25 deletions(-) create mode 100644 etc/iproute2/bpf_pinning diff --git a/etc/iproute2/bpf_pinning b/etc/iproute2/bpf_pinning new file mode 100644 index 00000000..2b39c709 --- /dev/null +++ b/etc/iproute2/bpf_pinning @@ -0,0 +1,6 @@ +# +# subpath mappings from mount point for pinning +# +#3 tracing +#4 foo/bar +#5 tc/cls1 diff --git a/include/bpf_elf.h b/include/bpf_elf.h index 0690dd6a..31a89743 100644 --- a/include/bpf_elf.h +++ b/include/bpf_elf.h @@ -33,7 +33,7 @@ struct bpf_elf_map { __u32 size_value; __u32 max_elem; __u32 id; - __u8 pinning; + __u32 pinning; }; #endif /* __BPF_ELF__ */ diff --git a/include/utils.h b/include/utils.h index 5902a985..e830be64 100644 --- a/include/utils.h +++ b/include/utils.h @@ -40,6 +40,10 @@ extern bool do_all; #define IPSEC_PROTO_ANY 255 #endif +#ifndef CONFDIR +#define CONFDIR "/etc/iproute2" +#endif + #define SPRINT_BSIZE 64 #define SPRINT_BUF(x) char x[SPRINT_BSIZE] diff --git a/lib/rt_names.c b/lib/rt_names.c index 1071a938..f6d17c0e 100644 --- a/lib/rt_names.c +++ b/lib/rt_names.c @@ -23,10 +23,7 @@ #include #include "rt_names.h" - -#ifndef CONFDIR -#define CONFDIR "/etc/iproute2" -#endif +#include "utils.h" #define NAME_MAX_LEN 512 diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index b44b1237..17c04e9b 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -458,6 +458,12 @@ struct bpf_elf_prog { const char *license; }; +struct bpf_hash_entry { + unsigned int pinning; + const char *subpath; + struct bpf_hash_entry *next; +}; + struct bpf_elf_ctx { Elf *elf_fd; GElf_Ehdr elf_hdr; @@ -474,6 +480,7 @@ struct bpf_elf_ctx { enum bpf_prog_type type; bool verbose; struct bpf_elf_st stat; + struct bpf_hash_entry *ht[256]; }; struct bpf_elf_sec_data { @@ -771,20 +778,34 @@ static int 
bpf_init_env(const char *pathname) return 0; } -static bool bpf_no_pinning(int pinning) +static const char *bpf_custom_pinning(const struct bpf_elf_ctx *ctx, + uint32_t pinning) +{ + struct bpf_hash_entry *entry; + + entry = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)]; + while (entry && entry->pinning != pinning) + entry = entry->next; + + return entry ? entry->subpath : NULL; +} + +static bool bpf_no_pinning(const struct bpf_elf_ctx *ctx, + uint32_t pinning) { switch (pinning) { case PIN_OBJECT_NS: case PIN_GLOBAL_NS: return false; case PIN_NONE: - default: return true; + default: + return !bpf_custom_pinning(ctx, pinning); } } static void bpf_make_pathname(char *pathname, size_t len, const char *name, - int pinning) + const struct bpf_elf_ctx *ctx, uint32_t pinning) { switch (pinning) { case PIN_OBJECT_NS: @@ -795,41 +816,89 @@ static void bpf_make_pathname(char *pathname, size_t len, const char *name, snprintf(pathname, len, "%s/%s/%s", bpf_get_tc_dir(), BPF_DIR_GLOBALS, name); break; + default: + snprintf(pathname, len, "%s/../%s/%s", bpf_get_tc_dir(), + bpf_custom_pinning(ctx, pinning), name); + break; } } -static int bpf_probe_pinned(const char *name, int pinning) +static int bpf_probe_pinned(const char *name, const struct bpf_elf_ctx *ctx, + uint32_t pinning) { char pathname[PATH_MAX]; - if (bpf_no_pinning(pinning) || !bpf_get_tc_dir()) + if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir()) return 0; - bpf_make_pathname(pathname, sizeof(pathname), name, pinning); + bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning); return bpf_obj_get(pathname); } -static int bpf_place_pinned(int fd, const char *name, int pinning) +static int bpf_make_obj_path(void) { - char pathname[PATH_MAX]; + char tmp[PATH_MAX]; int ret; - if (bpf_no_pinning(pinning) || !bpf_get_tc_dir()) - return 0; + snprintf(tmp, sizeof(tmp), "%s/%s", bpf_get_tc_dir(), + bpf_get_obj_uid(NULL)); - if (pinning == PIN_OBJECT_NS) { - snprintf(pathname, sizeof(pathname), "%s/%s", - 
bpf_get_tc_dir(), bpf_get_obj_uid(NULL)); + ret = mkdir(tmp, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno)); + return ret; + } - ret = mkdir(pathname, S_IRWXU); + return 0; +} + +static int bpf_make_custom_path(const char *todo) +{ + char tmp[PATH_MAX], rem[PATH_MAX], *sub; + int ret; + + snprintf(tmp, sizeof(tmp), "%s/../", bpf_get_tc_dir()); + snprintf(rem, sizeof(rem), "%s/", todo); + sub = strtok(rem, "/"); + + while (sub) { + if (strlen(tmp) + strlen(sub) + 2 > PATH_MAX) + return -EINVAL; + + strcat(tmp, sub); + strcat(tmp, "/"); + + ret = mkdir(tmp, S_IRWXU); if (ret && errno != EEXIST) { - fprintf(stderr, "mkdir %s failed: %s\n", pathname, + fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno)); return ret; } + + sub = strtok(NULL, "/"); } - bpf_make_pathname(pathname, sizeof(pathname), name, pinning); + return 0; +} + +static int bpf_place_pinned(int fd, const char *name, + const struct bpf_elf_ctx *ctx, uint32_t pinning) +{ + char pathname[PATH_MAX]; + const char *tmp; + int ret = 0; + + if (bpf_no_pinning(ctx, pinning) || !bpf_get_tc_dir()) + return 0; + + if (pinning == PIN_OBJECT_NS) + ret = bpf_make_obj_path(); + else if ((tmp = bpf_custom_pinning(ctx, pinning))) + ret = bpf_make_custom_path(tmp); + if (ret < 0) + return ret; + + bpf_make_pathname(pathname, sizeof(pathname), name, ctx, pinning); return bpf_obj_pin(fd, pathname); } @@ -856,11 +925,11 @@ static int bpf_prog_attach(const char *section, } static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, - bool verbose) + const struct bpf_elf_ctx *ctx, bool verbose) { int fd, ret; - fd = bpf_probe_pinned(name, map->pinning); + fd = bpf_probe_pinned(name, ctx, map->pinning); if (fd > 0) { ret = bpf_map_selfcheck_pinned(fd, map); if (ret < 0) { @@ -889,7 +958,7 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, return fd; } - ret = bpf_place_pinned(fd, name, map->pinning); + ret = 
bpf_place_pinned(fd, name, ctx, map->pinning); if (ret < 0 && errno != EEXIST) { fprintf(stderr, "Could not pin %s map: %s\n", name, strerror(errno)); @@ -940,7 +1009,8 @@ static int bpf_maps_attach_all(struct bpf_elf_ctx *ctx) if (!map_name) return -EIO; - fd = bpf_map_attach(map_name, &ctx->maps[i], ctx->verbose); + fd = bpf_map_attach(map_name, &ctx->maps[i], ctx, + ctx->verbose); if (fd < 0) return fd; @@ -1258,6 +1328,105 @@ static void bpf_save_finfo(struct bpf_elf_ctx *ctx) ctx->stat.st_ino = st.st_ino; } +static int bpf_read_pin_mapping(FILE *fp, uint32_t *id, char *path) +{ + char buff[PATH_MAX]; + + while (fgets(buff, sizeof(buff), fp)) { + char *ptr = buff; + + while (*ptr == ' ' || *ptr == '\t') + ptr++; + + if (*ptr == '#' || *ptr == '\n' || *ptr == 0) + continue; + + if (sscanf(ptr, "%i %s\n", id, path) != 2 && + sscanf(ptr, "%i %s #", id, path) != 2) { + strcpy(path, ptr); + return -1; + } + + return 1; + } + + return 0; +} + +static bool bpf_pinning_reserved(uint32_t pinning) +{ + switch (pinning) { + case PIN_NONE: + case PIN_OBJECT_NS: + case PIN_GLOBAL_NS: + return true; + default: + return false; + } +} + +static void bpf_hash_init(struct bpf_elf_ctx *ctx, const char *db_file) +{ + struct bpf_hash_entry *entry; + char subpath[PATH_MAX]; + uint32_t pinning; + FILE *fp; + int ret; + + fp = fopen(db_file, "r"); + if (!fp) + return; + + memset(subpath, 0, sizeof(subpath)); + while ((ret = bpf_read_pin_mapping(fp, &pinning, subpath))) { + if (ret == -1) { + fprintf(stderr, "Database %s is corrupted at: %s\n", + db_file, subpath); + fclose(fp); + return; + } + + if (bpf_pinning_reserved(pinning)) { + fprintf(stderr, "Database %s, id %u is reserved - " + "ignoring!\n", db_file, pinning); + continue; + } + + entry = malloc(sizeof(*entry)); + if (!entry) { + fprintf(stderr, "No memory left for db entry!\n"); + continue; + } + + entry->pinning = pinning; + entry->subpath = strdup(subpath); + if (!entry->subpath) { + fprintf(stderr, "No memory left for db 
entry!\n"); + free(entry); + continue; + } + + entry->next = ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)]; + ctx->ht[pinning & (ARRAY_SIZE(ctx->ht) - 1)] = entry; + } + + fclose(fp); +} + +static void bpf_hash_destroy(struct bpf_elf_ctx *ctx) +{ + struct bpf_hash_entry *entry; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->ht); i++) { + while ((entry = ctx->ht[i]) != NULL) { + ctx->ht[i] = entry->next; + free((char *)entry->subpath); + free(entry); + } + } +} + static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, enum bpf_prog_type type, bool verbose) { @@ -1295,6 +1464,8 @@ static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, } bpf_save_finfo(ctx); + bpf_hash_init(ctx, CONFDIR "/bpf_pinning"); + return 0; out_elf: elf_end(ctx->elf_fd); @@ -1331,6 +1502,7 @@ static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure) if (failure) bpf_maps_teardown(ctx); + bpf_hash_destroy(ctx); free(ctx->sec_done); elf_end(ctx->elf_fd); close(ctx->obj_fd); From 91d88eeb10cd4f51e3b5c675c7aee4ae1e41ff16 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 26 Nov 2015 15:38:45 +0100 Subject: [PATCH 098/151] {f,m}_bpf: allow updates on program arrays Since we have all infrastructure in place now, allow atomic live updates on program arrays. This can be very useful e.g. in case programs that are being tail-called need to be replaced, f.e. when classifier functionality needs to be changed, new protocols added/removed during runtime, etc. Thus, provide a way for in-place code updates, minimal example: Given is an object file cls.o that contains the entry point in section 'classifier', has a globally pinned program array 'jmp' with 2 slots and id of 0, and two tail called programs under section '0/0' (prog array key 0) and '0/1' (prog array key 1), the section encoding for the loader is <map-id>/<key-id>.
Adding the filter loads everything into cls_bpf: tc filter add dev foo parent ffff: bpf da obj cls.o Now, the program under section '0/1' needs to be replaced with an updated version that resides in the same section (also full path to tc's subfolder of the mount point can be passed, e.g. /sys/fs/bpf/tc/globals/jmp): tc exec bpf graft m:globals/jmp obj cls.o sec 0/1 In case the program resides under a different section 'foo', it can also be injected into the program array like: tc exec bpf graft m:globals/jmp key 1 obj cls.o sec foo If the new tail called classifier program is already available as a pinned object somewhere (here: /sys/fs/bpf/tc/progs/parser), it can be injected into the prog array like: tc exec bpf graft m:globals/jmp key 1 fd m:progs/parser In the kernel, the program on key 1 is being atomically replaced and the old one's refcount dropped. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tc/e_bpf.c | 30 +++- tc/tc_bpf.c | 430 ++++++++++++++++++++++++++++++++++------------------ tc/tc_bpf.h | 1 + 3 files changed, 309 insertions(+), 152 deletions(-) diff --git a/tc/e_bpf.c b/tc/e_bpf.c index 1f386c36..2d650a46 100644 --- a/tc/e_bpf.c +++ b/tc/e_bpf.c @@ -26,10 +26,19 @@ static char *argv_default[] = { BPF_DEFAULT_CMD, NULL }; static void explain(void) { - fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ] [ debug ]\n\n"); + fprintf(stderr, "Usage: ... bpf [ import UDS_FILE ] [ run CMD ]\n"); + fprintf(stderr, " ... bpf [ debug ]\n"); + fprintf(stderr, " ... bpf [ graft MAP_FILE ] [ key KEY ]\n"); + fprintf(stderr, " `... [ object-file OBJ_FILE ] [ type TYPE ] [ section NAME ] [ verbose ]\n"); + fprintf(stderr, " `... 
[ object-pinned PROG_FILE ]\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Where UDS_FILE provides the name of a unix domain socket file\n"); fprintf(stderr, "to import eBPF maps and the optional CMD denotes the command\n"); fprintf(stderr, "to be executed (default: \'%s\').\n", BPF_DEFAULT_CMD); + fprintf(stderr, "Where MAP_FILE points to a pinned map, OBJ_FILE to an object file\n"); + fprintf(stderr, "and PROG_FILE to a pinned program. TYPE can be {cls, act}, where\n"); + fprintf(stderr, "\'cls\' is default. KEY is optional and can be inferred from the\n"); + fprintf(stderr, "section name, otherwise it needs to be provided.\n"); } static int bpf_num_env_entries(void) @@ -67,6 +76,25 @@ static int parse_bpf(struct exec_util *eu, int argc, char **argv) fprintf(stderr, "No trace pipe, tracefs not mounted?\n"); return -1; + } else if (matches(*argv, "graft") == 0) { + const char *bpf_map_path; + bool has_key = false; + uint32_t key; + + NEXT_ARG(); + bpf_map_path = *argv; + NEXT_ARG(); + if (matches(*argv, "key") == 0) { + NEXT_ARG(); + if (get_unsigned(&key, *argv, 0)) { + fprintf(stderr, "Illegal \"key\"\n"); + return -1; + } + has_key = true; + NEXT_ARG(); + } + return bpf_graft_map(bpf_map_path, has_key ? 
+ &key : NULL, argc, argv); } else { explain(); return -1; diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 17c04e9b..beb74be6 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -76,13 +76,17 @@ static int bpf(int cmd, union bpf_attr *attr, unsigned int size) #endif } -static int bpf_obj_get(const char *pathname) +static int bpf_map_update(int fd, const void *key, const void *value, + uint64_t flags) { union bpf_attr attr = { - .pathname = bpf_ptr_to_u64(pathname), + .map_fd = fd, + .key = bpf_ptr_to_u64(key), + .value = bpf_ptr_to_u64(value), + .flags = flags, }; - return bpf(BPF_OBJ_GET, &attr, sizeof(attr)); + return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } static int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len, @@ -205,7 +209,8 @@ void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len) ops[i].jf, ops[i].k); } -static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map) +static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map, + int length) { char file[PATH_MAX], buff[4096]; struct bpf_elf_map tmp, zero; @@ -234,7 +239,7 @@ static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map) fclose(fp); - if (!memcmp(&tmp, map, offsetof(struct bpf_elf_map, id))) { + if (!memcmp(&tmp, map, length)) { return 0; } else { memset(&zero, 0, sizeof(zero)); @@ -243,7 +248,7 @@ static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map) * case, everything is 0. It is guaranteed that no such map exists * since map type of 0 is unloadable BPF_MAP_TYPE_UNSPEC. 
*/ - if (!memcmp(&tmp, &zero, offsetof(struct bpf_elf_map, id))) + if (!memcmp(&tmp, &zero, length)) return 0; fprintf(stderr, "Map specs from pinned file differ!\n"); @@ -251,6 +256,35 @@ static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map) } } +static int bpf_mnt_fs(const char *target) +{ + bool bind_done = false; + + while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) { + if (errno != EINVAL || bind_done) { + fprintf(stderr, "mount --make-private %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + if (mount(target, target, "none", MS_BIND, NULL)) { + fprintf(stderr, "mount --bind %s %s failed: %s\n", + target, target, strerror(errno)); + return -1; + } + + bind_done = true; + } + + if (mount("bpf", target, "bpf", 0, NULL)) { + fprintf(stderr, "mount -t bpf bpf %s failed: %s\n", + target, strerror(errno)); + return -1; + } + + return 0; +} + static int bpf_valid_mntpt(const char *mnt, unsigned long magic) { struct statfs st_fs; @@ -342,6 +376,79 @@ int bpf_trace_pipe(void) return 0; } +static const char *bpf_get_tc_dir(void) +{ + static bool bpf_mnt_cached = false; + static char bpf_tc_dir[PATH_MAX]; + static const char *mnt; + static const char * const bpf_known_mnts[] = { + BPF_DIR_MNT, + 0, + }; + char bpf_mnt[PATH_MAX] = BPF_DIR_MNT; + char bpf_glo_dir[PATH_MAX]; + int ret; + + if (bpf_mnt_cached) + goto done; + + mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt), + bpf_known_mnts); + if (!mnt) { + mnt = getenv(BPF_ENV_MNT); + if (!mnt) + mnt = BPF_DIR_MNT; + ret = bpf_mnt_fs(mnt); + if (ret) { + mnt = NULL; + goto out; + } + } + + snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC); + ret = mkdir(bpf_tc_dir, S_IRWXU); + if (ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s", + bpf_tc_dir, BPF_DIR_GLOBALS); + ret = mkdir(bpf_glo_dir, S_IRWXU); + if 
(ret && errno != EEXIST) { + fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir, + strerror(errno)); + mnt = NULL; + goto out; + } + + mnt = bpf_tc_dir; +out: + bpf_mnt_cached = true; +done: + return mnt; +} + +static int bpf_obj_get(const char *pathname) +{ + union bpf_attr attr; + char tmp[PATH_MAX]; + + if (strlen(pathname) > 2 && pathname[0] == 'm' && + pathname[1] == ':' && bpf_get_tc_dir()) { + snprintf(tmp, sizeof(tmp), "%s/%s", + bpf_get_tc_dir(), pathname + 2); + pathname = tmp; + } + + memset(&attr, 0, sizeof(attr)); + attr.pathname = bpf_ptr_to_u64(pathname); + + return bpf(BPF_OBJ_GET, &attr, sizeof(attr)); +} + const char *bpf_default_section(const enum bpf_prog_type type) { switch (type) { @@ -354,37 +461,45 @@ const char *bpf_default_section(const enum bpf_prog_type type) } } -int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, - enum bpf_prog_type type, const char **ptr_object, - const char **ptr_uds_name, struct nlmsghdr *n) -{ - struct sock_filter opcodes[BPF_MAXINSNS]; - const char *file, *section, *uds_name; - char **argv = *ptr_argv; - int argc = *ptr_argc; - char annotation[256]; - bool verbose = false; - int ret; - enum bpf_mode { - CBPF_BYTECODE, - CBPF_FILE, - EBPF_OBJECT, - EBPF_PINNED, - } mode; +enum bpf_mode { + CBPF_BYTECODE = 0, + CBPF_FILE, + EBPF_OBJECT, + EBPF_PINNED, + __BPF_MODE_MAX, +#define BPF_MODE_MAX __BPF_MODE_MAX +}; - if (matches(*argv, "bytecode") == 0 || - strcmp(*argv, "bc") == 0) { - mode = CBPF_BYTECODE; - } else if (matches(*argv, "bytecode-file") == 0 || - strcmp(*argv, "bcf") == 0) { - mode = CBPF_FILE; - } else if (matches(*argv, "object-file") == 0 || - strcmp(*argv, "obj") == 0) { - mode = EBPF_OBJECT; - } else if (matches(*argv, "object-pinned") == 0 || - matches(*argv, "pinned") == 0 || - matches(*argv, "fd") == 0) { - mode = EBPF_PINNED; +static int bpf_parse(int *ptr_argc, char ***ptr_argv, const bool *opt_tbl, + enum bpf_prog_type *type, enum bpf_mode *mode, + const char 
**ptr_object, const char **ptr_section, + const char **ptr_uds_name, struct sock_filter *opcodes) +{ + const char *file, *section, *uds_name; + bool verbose = false; + int ret, argc; + char **argv; + + argv = *ptr_argv; + argc = *ptr_argc; + + if (opt_tbl[CBPF_BYTECODE] && + (matches(*argv, "bytecode") == 0 || + strcmp(*argv, "bc") == 0)) { + *mode = CBPF_BYTECODE; + } else if (opt_tbl[CBPF_FILE] && + (matches(*argv, "bytecode-file") == 0 || + strcmp(*argv, "bcf") == 0)) { + *mode = CBPF_FILE; + } else if (opt_tbl[EBPF_OBJECT] && + (matches(*argv, "object-file") == 0 || + strcmp(*argv, "obj") == 0)) { + *mode = EBPF_OBJECT; + } else if (opt_tbl[EBPF_PINNED] && + (matches(*argv, "object-pinned") == 0 || + matches(*argv, "pinned") == 0 || + matches(*argv, "fd") == 0)) { + *mode = EBPF_PINNED; } else { fprintf(stderr, "What mode is \"%s\"?\n", *argv); return -1; @@ -392,11 +507,29 @@ int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, NEXT_ARG(); file = section = uds_name = NULL; - if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { + if (*mode == EBPF_OBJECT || *mode == EBPF_PINNED) { file = *argv; NEXT_ARG_FWD(); - section = bpf_default_section(type); + if (*type == BPF_PROG_TYPE_UNSPEC) { + if (argc > 0 && matches(*argv, "type") == 0) { + NEXT_ARG(); + if (matches(*argv, "cls") == 0) { + *type = BPF_PROG_TYPE_SCHED_CLS; + } else if (matches(*argv, "act") == 0) { + *type = BPF_PROG_TYPE_SCHED_ACT; + } else { + fprintf(stderr, "What type is \"%s\"?\n", + *argv); + return -1; + } + NEXT_ARG_FWD(); + } else { + *type = BPF_PROG_TYPE_SCHED_CLS; + } + } + + section = bpf_default_section(*type); if (argc > 0 && matches(*argv, "section") == 0) { NEXT_ARG(); section = *argv; @@ -419,37 +552,127 @@ int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, PREV_ARG(); } - if (mode == CBPF_BYTECODE || mode == CBPF_FILE) - ret = bpf_ops_parse(argc, argv, opcodes, mode == CBPF_FILE); - else if (mode == EBPF_OBJECT) - ret = bpf_obj_open(file, 
type, section, verbose); - else if (mode == EBPF_PINNED) + if (*mode == CBPF_BYTECODE || *mode == CBPF_FILE) + ret = bpf_ops_parse(argc, argv, opcodes, *mode == CBPF_FILE); + else if (*mode == EBPF_OBJECT) + ret = bpf_obj_open(file, *type, section, verbose); + else if (*mode == EBPF_PINNED) ret = bpf_obj_get(file); - if (ret < 0) + else return -1; + if (ptr_object) + *ptr_object = file; + if (ptr_section) + *ptr_section = section; + if (ptr_uds_name) + *ptr_uds_name = uds_name; + + *ptr_argc = argc; + *ptr_argv = argv; + + return ret; +} + +int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, + enum bpf_prog_type type, const char **ptr_object, + const char **ptr_uds_name, struct nlmsghdr *n) +{ + struct sock_filter opcodes[BPF_MAXINSNS]; + const bool opt_tbl[BPF_MODE_MAX] = { + [CBPF_BYTECODE] = true, + [CBPF_FILE] = true, + [EBPF_OBJECT] = true, + [EBPF_PINNED] = true, + }; + char annotation[256]; + const char *section; + enum bpf_mode mode; + int ret; + + ret = bpf_parse(ptr_argc, ptr_argv, opt_tbl, &type, &mode, + ptr_object, &section, ptr_uds_name, opcodes); + if (ret < 0) + return ret; + if (mode == CBPF_BYTECODE || mode == CBPF_FILE) { addattr16(n, MAX_MSG, nla_tbl[BPF_NLA_OPS_LEN], ret); addattr_l(n, MAX_MSG, nla_tbl[BPF_NLA_OPS], opcodes, ret * sizeof(struct sock_filter)); - } else if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { + } + + if (mode == EBPF_OBJECT || mode == EBPF_PINNED) { snprintf(annotation, sizeof(annotation), "%s:[%s]", - basename(file), mode == EBPF_PINNED ? "*fsobj" : - section); + basename(*ptr_object), mode == EBPF_PINNED ? 
+ "*fsobj" : section); addattr32(n, MAX_MSG, nla_tbl[BPF_NLA_FD], ret); addattrstrz(n, MAX_MSG, nla_tbl[BPF_NLA_NAME], annotation); } - *ptr_object = file; - *ptr_uds_name = uds_name; - - *ptr_argc = argc; - *ptr_argv = argv; - return 0; } +int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv) +{ + enum bpf_prog_type type = BPF_PROG_TYPE_UNSPEC; + const bool opt_tbl[BPF_MODE_MAX] = { + [CBPF_BYTECODE] = false, + [CBPF_FILE] = false, + [EBPF_OBJECT] = true, + [EBPF_PINNED] = true, + }; + const struct bpf_elf_map test = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + }; + int ret, prog_fd, map_fd; + const char *section; + enum bpf_mode mode; + uint32_t map_key; + + prog_fd = bpf_parse(&argc, &argv, opt_tbl, &type, &mode, + NULL, &section, NULL, NULL); + if (prog_fd < 0) + return prog_fd; + if (key) { + map_key = *key; + } else { + ret = sscanf(section, "%*i/%i", &map_key); + if (ret != 1) { + fprintf(stderr, "Couldn\'t infer map key from section " + "name! 
Please provide \'key\' argument!\n"); + ret = -EINVAL; + goto out_prog; + } + } + + map_fd = bpf_obj_get(map_path); + if (map_fd < 0) { + fprintf(stderr, "Couldn\'t retrieve pinned map \'%s\': %s\n", + map_path, strerror(errno)); + ret = map_fd; + goto out_prog; + } + + ret = bpf_map_selfcheck_pinned(map_fd, &test, + offsetof(struct bpf_elf_map, max_elem)); + if (ret < 0) { + fprintf(stderr, "Map \'%s\' self-check failed!\n", map_path); + goto out_map; + } + + ret = bpf_map_update(map_fd, &map_key, &prog_fd, BPF_ANY); + if (ret < 0) + fprintf(stderr, "Map update failed: %s\n", strerror(errno)); +out_map: + close(map_fd); +out_prog: + close(prog_fd); + return ret; +} + #ifdef HAVE_ELF struct bpf_elf_prog { enum bpf_prog_type type; @@ -530,19 +753,6 @@ static int bpf_map_create(enum bpf_map_type type, unsigned int size_key, return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } -static int bpf_map_update(int fd, const void *key, const void *value, - uint64_t flags) -{ - union bpf_attr attr = { - .map_fd = fd, - .key = bpf_ptr_to_u64(key), - .value = bpf_ptr_to_u64(value), - .flags = flags, - }; - - return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); -} - static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, size_t size, const char *license) { @@ -672,90 +882,6 @@ done: return bpf_uid; } -static int bpf_mnt_fs(const char *target) -{ - bool bind_done = false; - - while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) { - if (errno != EINVAL || bind_done) { - fprintf(stderr, "mount --make-private %s failed: %s\n", - target, strerror(errno)); - return -1; - } - - if (mount(target, target, "none", MS_BIND, NULL)) { - fprintf(stderr, "mount --bind %s %s failed: %s\n", - target, target, strerror(errno)); - return -1; - } - - bind_done = true; - } - - if (mount("bpf", target, "bpf", 0, NULL)) { - fprintf(stderr, "mount -t bpf bpf %s failed: %s\n", - target, strerror(errno)); - return -1; - } - - return 0; -} - -static const char 
*bpf_get_tc_dir(void) -{ - static bool bpf_mnt_cached = false; - static char bpf_tc_dir[PATH_MAX]; - static const char *mnt; - static const char * const bpf_known_mnts[] = { - BPF_DIR_MNT, - 0, - }; - char bpf_mnt[PATH_MAX] = BPF_DIR_MNT; - char bpf_glo_dir[PATH_MAX]; - int ret; - - if (bpf_mnt_cached) - goto done; - - mnt = bpf_find_mntpt("bpf", BPF_FS_MAGIC, bpf_mnt, sizeof(bpf_mnt), - bpf_known_mnts); - if (!mnt) { - mnt = getenv(BPF_ENV_MNT); - if (!mnt) - mnt = BPF_DIR_MNT; - ret = bpf_mnt_fs(mnt); - if (ret) { - mnt = NULL; - goto out; - } - } - - snprintf(bpf_tc_dir, sizeof(bpf_tc_dir), "%s/%s", mnt, BPF_DIR_TC); - ret = mkdir(bpf_tc_dir, S_IRWXU); - if (ret && errno != EEXIST) { - fprintf(stderr, "mkdir %s failed: %s\n", bpf_tc_dir, - strerror(errno)); - mnt = NULL; - goto out; - } - - snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s", - bpf_tc_dir, BPF_DIR_GLOBALS); - ret = mkdir(bpf_glo_dir, S_IRWXU); - if (ret && errno != EEXIST) { - fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir, - strerror(errno)); - mnt = NULL; - goto out; - } - - mnt = bpf_tc_dir; -out: - bpf_mnt_cached = true; -done: - return mnt; -} - static int bpf_init_env(const char *pathname) { struct rlimit limit = { @@ -931,7 +1057,9 @@ static int bpf_map_attach(const char *name, const struct bpf_elf_map *map, fd = bpf_probe_pinned(name, ctx, map->pinning); if (fd > 0) { - ret = bpf_map_selfcheck_pinned(fd, map); + ret = bpf_map_selfcheck_pinned(fd, map, + offsetof(struct bpf_elf_map, + id)); if (ret < 0) { close(fd); fprintf(stderr, "Map \'%s\' self-check failed!\n", diff --git a/tc/tc_bpf.h b/tc/tc_bpf.h index dea3c3bc..526d0b12 100644 --- a/tc/tc_bpf.h +++ b/tc/tc_bpf.h @@ -55,6 +55,7 @@ const char *bpf_default_section(const enum bpf_prog_type type); int bpf_parse_common(int *ptr_argc, char ***ptr_argv, const int *nla_tbl, enum bpf_prog_type type, const char **ptr_object, const char **ptr_uds_name, struct nlmsghdr *n); +int bpf_graft_map(const char *map_path, uint32_t *key, int 
argc, char **argv); void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len); From 0b7e3fc8f1abe63df0d511905b2a09064225f3a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 26 Nov 2015 15:38:46 +0100 Subject: [PATCH 099/151] {f,m}_bpf: add more example code I've added three examples to examples/bpf/ that demonstrate how one can implement eBPF tail calls in tc with f.e. multiple levels of nesting. That should act as a good starting point, but also as test cases for the ELF loader and kernel. A real test suite for {f,m,e}_bpf is still to be developed in future work. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- examples/bpf/README | 13 ++++ examples/bpf/bpf_cyclic.c | 32 ++++++++++ examples/bpf/bpf_funcs.h | 11 ++++ examples/bpf/bpf_graft.c | 70 ++++++++++++++++++++++ examples/bpf/bpf_tailcall.c | 115 ++++++++++++++++++++++++++++++++++++ 5 files changed, 241 insertions(+) create mode 100644 examples/bpf/README create mode 100644 examples/bpf/bpf_cyclic.c create mode 100644 examples/bpf/bpf_graft.c create mode 100644 examples/bpf/bpf_tailcall.c diff --git a/examples/bpf/README b/examples/bpf/README new file mode 100644 index 00000000..42472578 --- /dev/null +++ b/examples/bpf/README @@ -0,0 +1,13 @@ +eBPF toy code examples (running in kernel) to familiarize yourself +with syntax and features: + + - bpf_prog.c -> Classifier examples with using maps + - bpf_shared.c -> Ingress/egress map sharing example + - bpf_tailcall.c -> Using tail call chains + - bpf_cyclic.c -> Simple cycle as tail calls + - bpf_graft.c -> Demo on altering runtime behaviour + +User space code example: + + - bpf_agent.c -> Counterpart to bpf_prog.c for user + space to transfer/read out map data diff --git a/examples/bpf/bpf_cyclic.c b/examples/bpf/bpf_cyclic.c new file mode 100644 index 00000000..bde061cf --- /dev/null +++ b/examples/bpf/bpf_cyclic.c @@ -0,0 +1,32 @@ +#include + +#include "bpf_funcs.h" + +/* Cyclic dependency example to test the kernel's runtime 
upper + * bound on loops. + */ +struct bpf_elf_map __section("maps") jmp_tc = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .id = 0xabccba, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_OBJECT_NS, + .max_elem = 1, +}; + +__section_tail(0xabccba, 0) int cls_loop(struct __sk_buff *skb) +{ + char fmt[] = "cb: %u\n"; + + bpf_printk(fmt, sizeof(fmt), skb->cb[0]++); + bpf_tail_call(skb, &jmp_tc, 0); + return -1; +} + +__section("classifier") int cls_entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_tc, 0); + return -1; +} + +char __license[] __section("license") = "GPL"; diff --git a/examples/bpf/bpf_funcs.h b/examples/bpf/bpf_funcs.h index 1369401a..6d058f0e 100644 --- a/examples/bpf/bpf_funcs.h +++ b/examples/bpf/bpf_funcs.h @@ -10,10 +10,18 @@ # define __maybe_unused __attribute__ ((__unused__)) #endif +#ifndef __stringify +# define __stringify(x) #x +#endif + #ifndef __section # define __section(NAME) __attribute__((section(NAME), used)) #endif +#ifndef __section_tail +# define __section_tail(m, x) __section(__stringify(m) "/" __stringify(x)) +#endif + #ifndef offsetof # define offsetof __builtin_offsetof #endif @@ -50,6 +58,9 @@ static unsigned int (*get_prandom_u32)(void) __maybe_unused = static int (*bpf_printk)(const char *fmt, int fmt_size, ...) __maybe_unused = (void *) BPF_FUNC_trace_printk; +static void (*bpf_tail_call)(void *ctx, void *map, int index) __maybe_unused = + (void *) BPF_FUNC_tail_call; + /* LLVM built-in functions that an eBPF C program may use to emit * BPF_LD_ABS and BPF_LD_IND instructions. */ diff --git a/examples/bpf/bpf_graft.c b/examples/bpf/bpf_graft.c new file mode 100644 index 00000000..f36d25a2 --- /dev/null +++ b/examples/bpf/bpf_graft.c @@ -0,0 +1,70 @@ +#include + +#include "bpf_funcs.h" + +/* This example demonstrates how classifier run-time behaviour + * can be altered with tail calls. 
We start out with an empty + * jmp_tc array, then add section aaa to the array slot 0, and + * later on atomically replace it with section bbb. Note that + * as shown in other examples, the tc loader can prepopulate + * tail called sections, here we start out with an empty one + * on purpose to show it can also be done this way. + * + * tc filter add dev foo parent ffff: bpf obj graft.o + * tc exec bpf dbg + * [...] + * Socket Thread-20229 [001] ..s. 138993.003923: : fallthrough + * -0 [001] ..s. 138993.202265: : fallthrough + * Socket Thread-20229 [001] ..s. 138994.004149: : fallthrough + * [...] + * + * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec aaa + * tc exec bpf dbg + * [...] + * Socket Thread-19818 [002] ..s. 139012.053587: : aaa + * -0 [002] ..s. 139012.172359: : aaa + * Socket Thread-19818 [001] ..s. 139012.173556: : aaa + * [...] + * + * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec bbb + * tc exec bpf dbg + * [...] + * Socket Thread-19818 [002] ..s. 139022.102967: : bbb + * -0 [002] ..s. 139022.155640: : bbb + * Socket Thread-19818 [001] ..s. 139022.156730: : bbb + * [...] 
+ */ +struct bpf_elf_map __section("maps") jmp_tc = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +__section("aaa") int cls_aaa(struct __sk_buff *skb) +{ + char fmt[] = "aaa\n"; + + bpf_printk(fmt, sizeof(fmt)); + return -1; +} + +__section("bbb") int cls_bbb(struct __sk_buff *skb) +{ + char fmt[] = "bbb\n"; + + bpf_printk(fmt, sizeof(fmt)); + return -1; +} + +__section("classifier") int cls_entry(struct __sk_buff *skb) +{ + char fmt[] = "fallthrough\n"; + + bpf_tail_call(skb, &jmp_tc, 0); + bpf_printk(fmt, sizeof(fmt)); + return -1; +} + +char __license[] __section("license") = "GPL"; diff --git a/examples/bpf/bpf_tailcall.c b/examples/bpf/bpf_tailcall.c new file mode 100644 index 00000000..f186e575 --- /dev/null +++ b/examples/bpf/bpf_tailcall.c @@ -0,0 +1,115 @@ +#include + +#include "bpf_funcs.h" + +#define ENTRY_INIT 3 +#define ENTRY_0 0 +#define ENTRY_1 1 +#define MAX_JMP_SIZE 2 + +#define FOO 42 +#define BAR 43 + +/* This example doesn't really do anything useful, but its purpose is to + * demonstrate eBPF tail calls on a very simple example. + * + * cls_entry() is our classifier entry point, from there we jump based on + * skb->hash into cls_case1() or cls_case2(). They are both part of the + * program array jmp_tc. Indicated via __section_tail(), the tc loader + * populates the program arrays with the loaded file descriptors already. + * + * To demonstrate nested jumps, cls_case2() jumps within the same jmp_tc + * array to cls_case1(). And whenever we arrive at cls_case1(), we jump + * into cls_exit(), part of the jump array jmp_ex. + * + * Also, to show it's possible, all programs share map_sh and dump the value + * that the entry point incremented. The sections that are loaded into a + * program array can be atomically replaced during run-time, e.g. to change + * classifier behaviour. 
+ */ +struct bpf_elf_map __section("maps") map_sh = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_OBJECT_NS, + .max_elem = 1, +}; + +struct bpf_elf_map __section("maps") jmp_tc = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .id = FOO, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_OBJECT_NS, + .max_elem = MAX_JMP_SIZE, +}; + +struct bpf_elf_map __section("maps") jmp_ex = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .id = BAR, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_OBJECT_NS, + .max_elem = 1, +}; + +__section_tail(FOO, ENTRY_0) int cls_case1(struct __sk_buff *skb) +{ + char fmt[] = "case1: map-val: %d from:%u\n"; + int key = 0, *val; + + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) + bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + skb->cb[0] = ENTRY_0; + bpf_tail_call(skb, &jmp_ex, ENTRY_0); + return 0; +} + +__section_tail(FOO, ENTRY_1) int cls_case2(struct __sk_buff *skb) +{ + char fmt[] = "case2: map-val: %d from:%u\n"; + int key = 0, *val; + + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) + bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + skb->cb[0] = ENTRY_1; + bpf_tail_call(skb, &jmp_tc, ENTRY_0); + return 0; +} + +__section_tail(BAR, ENTRY_0) int cls_exit(struct __sk_buff *skb) +{ + char fmt[] = "exit: map-val: %d from:%u\n"; + int key = 0, *val; + + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) + bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + + /* Termination point. */ + return -1; +} + +__section("classifier") int cls_entry(struct __sk_buff *skb) +{ + char fmt[] = "fallthrough\n"; + int key = 0, *val; + + /* For transferring state, we can use skb->cb[0] ... skb->cb[4]. 
*/ + val = bpf_map_lookup_elem(&map_sh, &key); + if (val) { + __sync_fetch_and_add(val, 1); + + skb->cb[0] = ENTRY_INIT; + bpf_tail_call(skb, &jmp_tc, skb->hash & (MAX_JMP_SIZE - 1)); + } + + bpf_printk(fmt, sizeof(fmt)); + return 0; +} + +char __license[] __section("license") = "GPL"; From cc9c1dfaee04030f1c2a80fb28d99c62ce0fca6a Mon Sep 17 00:00:00 2001 From: Konstantin Shemyak Date: Thu, 26 Nov 2015 18:22:05 +0200 Subject: [PATCH 100/151] ip_tunnel: determine tunnel address family from the tunnel type On 24.11.2015 02:26, Stephen Hemminger wrote: > On Thu, 12 Nov 2015 21:10:08 +0000 > Konstantin Shemyak wrote: > >> When creating an IP tunnel over IPv6, the address family must be passed in >> the option, e.g. >> >> ip -6 tunnel add mode ip6gre local 1::1 remote 2::2 >> >> This makes it impossible to create both IPv4 and IPv6 tunnels in one batch. >> >> In fact the address family option is redundant here, as each tunnel mode is >> relevant for only one address family. >> The patch determines whether the applicable address family is AF_INET6 >> instead of the default AF_INET and makes the "-6" option unnecessary for >> "ip tunnel add". 
>> >> Signed-off-by: Konstantin Shemyak >> --- >> ip/iptunnel.c | 26 ++++++++++++++++++++++++++ >> testsuite/tests/ip/tunnel/add_tunnel.t | 14 ++++++++++++++ >> 2 files changed, 40 insertions(+) >> create mode 100755 testsuite/tests/ip/tunnel/add_tunnel.t >> >> diff --git a/ip/iptunnel.c b/ip/iptunnel.c >> index 78fa988..7826a37 100644 >> --- a/ip/iptunnel.c >> +++ b/ip/iptunnel.c >> @@ -629,8 +629,34 @@ static int do_6rd(int argc, char **argv) >> return tnl_6rd_ioctl(cmd, medium, &ip6rd); >> } >> >> +static int tunnel_mode_is_ipv6(char *tunnel_mode) { >> + char *ipv6_modes[] = { >> + "ipv6/ipv6", "ip6ip6", >> + "vti6", >> + "ip/ipv6", "ipv4/ipv6", "ipip6", "ip4ip6", >> + "ip6gre", "gre/ipv6", >> + "any/ipv6", "any" >> + }; >> + int i; >> + >> + for (i = 0; i < sizeof(ipv6_modes) / sizeof(char *); i++) { >> + if (strcmp(ipv6_modes[i], tunnel_mode) == 0) >> + return 1; >> + } >> + return 0; >> +} >> + > > The ipv6_modes table should be static const. Thank you for the note! attached the corrected patch. > Also is it possible to use strstr for ipv6 and ip6 or even strchr(tunnel_mode, '6') > to simplify this? There is IPv6 tunnel mode 'any', and IPv4 tunnel mode 'ipv6/ip' (aka 'sit'). It looks to me that attempts to find some substring match would not make the code much shorter, but definitely less readable. Konstantin Shemyak. >From 42d27db0055c3a114fe6eb86d680bef9ec098ad4 Mon Sep 17 00:00:00 2001 From: Konstantin Shemyak Date: Thu, 12 Nov 2015 20:52:02 +0200 Subject: [PATCH] Tunnel address family is determined from the tunnel mode When the tunnel mode already tells the IP address family, "ip tunnel" command determines it and does not require option "-4"/"-6" to be passed. This makes possible creating both IPv4 and IPv6 tunnels in one batch. 
Signed-off-by: Konstantin Shemyak --- ip/iptunnel.c | 26 ++++++++++++++++++++++++++ testsuite/tests/ip/tunnel/add_tunnel.t | 14 ++++++++++++++ 2 files changed, 40 insertions(+) create mode 100755 testsuite/tests/ip/tunnel/add_tunnel.t diff --git a/ip/iptunnel.c b/ip/iptunnel.c index b9552edc..096bbe4e 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -570,8 +570,34 @@ static int do_6rd(int argc, char **argv) return tnl_6rd_ioctl(cmd, medium, &ip6rd); } +static int tunnel_mode_is_ipv6(char *tunnel_mode) { + static const char *ipv6_modes[] = { + "ipv6/ipv6", "ip6ip6", + "vti6", + "ip/ipv6", "ipv4/ipv6", "ipip6", "ip4ip6", + "ip6gre", "gre/ipv6", + "any/ipv6", "any" + }; + int i; + + for (i = 0; i < sizeof(ipv6_modes) / sizeof(ipv6_modes[0]); i++) { + if (strcmp(ipv6_modes[i], tunnel_mode) == 0) + return 1; + } + return 0; +} + int do_iptunnel(int argc, char **argv) { + int i; + + for (i = 0; i < argc - 1; i++) { + if (strcmp(argv[i], "mode") == 0) { + if (tunnel_mode_is_ipv6(argv[i + 1])) + preferred_family = AF_INET6; + break; + } + } switch (preferred_family) { case AF_UNSPEC: preferred_family = AF_INET; diff --git a/testsuite/tests/ip/tunnel/add_tunnel.t b/testsuite/tests/ip/tunnel/add_tunnel.t new file mode 100755 index 00000000..18f6e370 --- /dev/null +++ b/testsuite/tests/ip/tunnel/add_tunnel.t @@ -0,0 +1,14 @@ +#!/bin/sh + +source lib/generic.sh + +TUNNEL_NAME="tunnel_test_ip" + +ts_log "[Testing add/del tunnels]" + +ts_ip "$0" "Add GRE tunnel over IPv4" tunnel add name $TUNNEL_NAME mode gre local 1.1.1.1 remote 2.2.2.2 +ts_ip "$0" "Del GRE tunnel over IPv4" tunnel del $TUNNEL_NAME + +ts_ip "$0" "Add GRE tunnel over IPv6" tunnel add name $TUNNEL_NAME mode ip6gre local dead:beef::1 remote dead:beef::2 +ts_ip "$0" "Del GRE tunnel over IPv6" tunnel del $TUNNEL_NAME + From a96a5d94c6bd58cb455c66a38cff6077841e7aab Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 Nov 2015 12:05:39 -0800 Subject: [PATCH 101/151] iptunnel: cleanup code Make iptunnel pass 
checkpatch (mostly). --- ip/iptunnel.c | 57 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/ip/iptunnel.c b/ip/iptunnel.c index 096bbe4e..ce9ee320 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -50,7 +50,8 @@ static void usage(void) static void set_tunnel_proto(struct ip_tunnel_parm *p, int proto) { if (p->iph.protocol && p->iph.protocol != proto) { - fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); + fprintf(stderr, + "You managed to ask for more than one tunnel mode.\n"); exit(-1); } p->iph.protocol = proto; @@ -91,7 +92,8 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) set_tunnel_proto(p, IPPROTO_IPIP); p->i_flags |= VTI_ISVTI; } else { - fprintf(stderr,"Unknown tunnel mode \"%s\"\n", *argv); + fprintf(stderr, + "Unknown tunnel mode \"%s\"\n", *argv); exit(-1); } } else if (strcmp(*argv, "key") == 0) { @@ -144,6 +146,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) strcmp(*argv, "hoplimit") == 0 || strcmp(*argv, "hlim") == 0) { __u8 uval; + NEXT_ARG(); if (strcmp(*argv, "inherit") != 0) { if (get_u8(&uval, *argv, 0)) @@ -155,6 +158,7 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) matches(*argv, "dsfield") == 0) { char *dsfield; __u32 uval; + NEXT_ARG(); dsfield = *argv; strsep(&dsfield, "/"); @@ -169,15 +173,17 @@ static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) p->iph.tos |= uval; } } else { - if (strcmp(*argv, "name") == 0) { + if (strcmp(*argv, "name") == 0) NEXT_ARG(); - } else if (matches(*argv, "help") == 0) + else if (matches(*argv, "help") == 0) usage(); + if (p->name[0]) duparg2("name", *argv); strncpy(p->name, *argv, IFNAMSIZ - 1); if (cmd == SIOCCHGTUNNEL && count == 0) { struct ip_tunnel_parm old_p; + memset(&old_p, 0, sizeof(old_p)); if (tnl_get_ioctl(*argv, &old_p)) return -1; @@ -268,8 +274,10 @@ static int do_add(int 
cmd, int argc, char **argv) return -1; } - if (!(basedev = tnl_defname(&p))) { - fprintf(stderr, "cannot determine tunnel mode (ipip, gre, vti or sit)\n"); + basedev = tnl_defname(&p); + if (!basedev) { + fprintf(stderr, + "cannot determine tunnel mode (ipip, gre, vti or sit)\n"); return -1; } @@ -312,18 +320,18 @@ static void print_tunnel(struct ip_tunnel_parm *p) prl[0].addr = htonl(INADDR_ANY); if (!tnl_prl_ioctl(SIOCGETPRL, p->name, prl)) - for (i = 1; i < sizeof(prl) / sizeof(prl[0]); i++) - { - if (prl[i].addr != htonl(INADDR_ANY)) { - printf(" %s %s ", - (prl[i].flags & PRL_DEFAULT) ? "pdr" : "pr", - format_host(AF_INET, 4, &prl[i].addr, s1, sizeof(s1))); + for (i = 1; i < ARRAY_SIZE(prl); i++) { + if (prl[i].addr != htonl(INADDR_ANY)) { + printf(" %s %s ", + (prl[i].flags & PRL_DEFAULT) ? "pdr" : "pr", + format_host(AF_INET, 4, &prl[i].addr, s1, sizeof(s1))); + } } - } } if (p->link) { const char *n = ll_index_to_name(p->link); + if (n) printf(" dev %s", n); } @@ -381,6 +389,7 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) char buf[512]; int err = -1; FILE *fp = fopen("/proc/net/dev", "r"); + if (fp == NULL) { perror("fopen"); return -1; @@ -404,7 +413,8 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) char *ptr; buf[sizeof(buf) - 1] = 0; - if ((ptr = strchr(buf, ':')) == NULL || + ptr = strchr(buf, ':'); + if (ptr == NULL || (*ptr++ = 0, sscanf(buf, "%s", name) != 1)) { fprintf(stderr, "Wrong format for /proc/net/dev. Giving up.\n"); goto end; @@ -463,7 +473,8 @@ static int do_show(int argc, char **argv) if (parse_args(argc, argv, SIOCGETTUNNEL, &p) < 0) return -1; - if (!(basedev = tnl_defname(&p))) + basedev = tnl_defname(&p); + if (!basedev) return do_tunnels_list(&p); if (tnl_get_ioctl(p.name[0] ? 
p.name : basedev, &p)) @@ -507,11 +518,13 @@ static int do_prl(int argc, char **argv) strncpy(medium, *argv, IFNAMSIZ-1); devname++; } else { - fprintf(stderr,"Invalid PRL parameter \"%s\"\n", *argv); + fprintf(stderr, + "Invalid PRL parameter \"%s\"\n", *argv); exit(-1); } if (count > 1) { - fprintf(stderr,"One PRL entry at a time\n"); + fprintf(stderr, + "One PRL entry at a time\n"); exit(-1); } argc--; argv++; @@ -557,7 +570,8 @@ static int do_6rd(int argc, char **argv) strncpy(medium, *argv, IFNAMSIZ-1); devname++; } else { - fprintf(stderr,"Invalid 6RD parameter \"%s\"\n", *argv); + fprintf(stderr, + "Invalid 6RD parameter \"%s\"\n", *argv); exit(-1); } argc--; argv++; @@ -570,8 +584,9 @@ static int do_6rd(int argc, char **argv) return tnl_6rd_ioctl(cmd, medium, &ip6rd); } -static int tunnel_mode_is_ipv6(char *tunnel_mode) { - static const char *ipv6_modes[] = { +static int tunnel_mode_is_ipv6(char *tunnel_mode) +{ + static const char * const ipv6_modes[] = { "ipv6/ipv6", "ip6ip6", "vti6", "ip/ipv6", "ipv4/ipv6", "ipip6", "ip4ip6", @@ -580,7 +595,7 @@ static int tunnel_mode_is_ipv6(char *tunnel_mode) { }; int i; - for (i = 0; i < sizeof(ipv6_modes) / sizeof(ipv6_modes[0]); i++) { + for (i = 0; i < ARRAY_SIZE(ipv6_modes); i++) { if (strcmp(ipv6_modes[i], tunnel_mode) == 0) return 1; } From 0f7543322c5fd64d70672578979cf74226f54b64 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 2 Dec 2015 13:50:22 +0100 Subject: [PATCH 102/151] route: ignore RTAX_HOPLIMIT of value -1 Older kernels use -1 internally as indicator to use the sysctl default, but they still export the setting. Newer kernels use 0 to indicate that (which is why the conversion from -1 to 0 was done here), but they also stopped exporting the value. Since the meaning of -1 is clear, treat it equally like default on newer kernels (which is to not print anything). 
Signed-off-by: Phil Sutter --- ip/iproute.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index aed1038e..c42ea0b9 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -577,24 +577,23 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) if (mxrta[i] == NULL) continue; - if (i < sizeof(mx_names)/sizeof(char*) && mx_names[i]) - fprintf(fp, " %s", mx_names[i]); - else - fprintf(fp, " metric %d", i); - if (mxlock & (1< Date: Thu, 3 Dec 2015 17:13:48 +0100 Subject: [PATCH 103/151] libnetlink: don't confuse variables in rtnl_talk() There are two variables named 'len' in rtnl_talk. In fact, commit c079e121a73a didn't work. For example, it was possible to trigger a seg fault with this command: $ ip link set gre2 type ip6gre hoplimit 32 Let's rename the argument len to maxlen. Fixes: c079e121a73a ("libnetlink: add size argument to rtnl_talk") Reported-by: Thomas Faivre Signed-off-by: Nicolas Dichtel --- lib/libnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 922ec2d9..16582144 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -332,7 +332,7 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth, } int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, - struct nlmsghdr *answer, size_t len) + struct nlmsghdr *answer, size_t maxlen) { int status; unsigned seq; @@ -415,7 +415,7 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, } else if (!err->error) { if (answer) memcpy(answer, h, - MIN(len, h->nlmsg_len)); + MIN(maxlen, h->nlmsg_len)); return 0; } @@ -427,7 +427,7 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, if (answer) { memcpy(answer, h, - MIN(len, h->nlmsg_len)); + MIN(maxlen, h->nlmsg_len)); return 0; } From 8a23f820457b309fc30d093109ef1e6aa57842c1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 8 Dec 2015 12:24:44 -0800 Subject: [PATCH 104/151] vrf: Add support for
table names Currently, the table id for VRF devices requires an integer. Convert it to use rtnl_rttable_a2n which handles table names from the iproute2 directory. This also fixes a bug in the original commit where table names are not properly handled. Fixes: 15faa0a30bed ("add support for VRF device") Signed-off-by: David Ahern --- ip/iplink_vrf.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c index 913a2892..9b4b7728 100644 --- a/ip/iplink_vrf.c +++ b/ip/iplink_vrf.c @@ -28,12 +28,6 @@ static void explain(void) vrf_explain(stderr); } -static int table_arg(void) -{ - fprintf(stderr,"Error: argument of \"table\" must be 0-32767 and currently unused\n"); - return -1; -} - static int vrf_parse_opt(struct link_util *lu, int argc, char **argv, struct nlmsghdr *n) { @@ -43,9 +37,8 @@ static int vrf_parse_opt(struct link_util *lu, int argc, char **argv, NEXT_ARG(); - table = atoi(*argv); - if (table > 32767) - return table_arg(); + if (rtnl_rttable_a2n(&table, *argv)) + invarg("invalid table ID\n", *argv); addattr32(n, 1024, IFLA_VRF_TABLE, table); } else if (matches(*argv, "help") == 0) { explain(); From b08b5ff128874f94a1bc9dd8e178fa0e57c11c61 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 10 Dec 2015 13:24:51 +0100 Subject: [PATCH 105/151] tc.8: Fix reference to tc-tcindex.8 Just a typo there, it's spelled correctly in SEE ALSO section. Signed-off-by: Phil Sutter --- man/man8/tc.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/tc.8 b/man/man8/tc.8 index 6275c4b3..4e99dcad 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -181,7 +181,7 @@ Match Resource Reservation Protocol (RSVP) packets. .TP tcindex Filter packets based on traffic control index. See -.BR tc-index (8). +.BR tc-tcindex (8). .TP u32 Generic filtering on arbitrary packet data, assisted by syntax to abstract common operations. 
See From 654ae881de57467642c8c2ed16ffc3a8d57fafa2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Dec 2015 08:52:10 -0800 Subject: [PATCH 106/151] ip: fix format string when reading statistics The tunnel code was doing sscanf(buf, "%ld", &x) where x was unsigned long. --- ip/ip6tunnel.c | 2 +- ip/iptunnel.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 320d2539..1737d884 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -354,7 +354,7 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) fprintf(stderr, "Wrong format for /proc/net/dev. Giving up.\n"); goto end; } - if (sscanf(ptr, "%ld%ld%ld%ld%ld%ld%ld%*d%ld%ld%ld%ld%ld%ld%ld", + if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", &rx_bytes, &rx_packets, &rx_errs, &rx_drops, &rx_fifo, &rx_frame, &rx_multi, &tx_bytes, &tx_packets, &tx_errs, &tx_drops, diff --git a/ip/iptunnel.c b/ip/iptunnel.c index ce9ee320..a3ff99bd 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -419,7 +419,7 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) fprintf(stderr, "Wrong format for /proc/net/dev. Giving up.\n"); goto end; } - if (sscanf(ptr, "%ld%ld%ld%ld%ld%ld%ld%*d%ld%ld%ld%ld%ld%ld%ld", + if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", &rx_bytes, &rx_packets, &rx_errs, &rx_drops, &rx_fifo, &rx_frame, &rx_multi, &tx_bytes, &tx_packets, &tx_errs, &tx_drops, From 41d6e33fc9e5b459b7f715acbd6d8dbeddf58576 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Dec 2015 00:25:36 +0100 Subject: [PATCH 107/151] examples, bpf: further improve examples Improve example files further and add a more generic set of possible helpers for them that can be used. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- examples/bpf/bpf_cyclic.c | 38 +++--- examples/bpf/bpf_funcs.h | 76 ------------ examples/bpf/bpf_graft.c | 39 +++---- examples/bpf/bpf_prog.c | 33 +++--- examples/bpf/bpf_shared.c | 32 +++-- examples/bpf/bpf_shared.h | 2 +- examples/bpf/bpf_tailcall.c | 78 +++++-------- include/bpf_api.h | 225 ++++++++++++++++++++++++++++++++++++ 8 files changed, 324 insertions(+), 199 deletions(-) delete mode 100644 examples/bpf/bpf_funcs.h create mode 100644 include/bpf_api.h diff --git a/examples/bpf/bpf_cyclic.c b/examples/bpf/bpf_cyclic.c index bde061cf..c66cbecc 100644 --- a/examples/bpf/bpf_cyclic.c +++ b/examples/bpf/bpf_cyclic.c @@ -1,32 +1,30 @@ -#include - -#include "bpf_funcs.h" +#include "../../include/bpf_api.h" /* Cyclic dependency example to test the kernel's runtime upper - * bound on loops. + * bound on loops. Also demonstrates how to use direct-actions, + * loaded as: tc filter add [...] bpf da obj [...] */ -struct bpf_elf_map __section("maps") jmp_tc = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .id = 0xabccba, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_OBJECT_NS, - .max_elem = 1, -}; +#define JMP_MAP_ID 0xabccba -__section_tail(0xabccba, 0) int cls_loop(struct __sk_buff *skb) +BPF_PROG_ARRAY(jmp_tc, JMP_MAP_ID, PIN_OBJECT_NS, 1); + +__section_tail(JMP_MAP_ID, 0) +int cls_loop(struct __sk_buff *skb) { char fmt[] = "cb: %u\n"; - bpf_printk(fmt, sizeof(fmt), skb->cb[0]++); - bpf_tail_call(skb, &jmp_tc, 0); - return -1; + trace_printk(fmt, sizeof(fmt), skb->cb[0]++); + tail_call(skb, &jmp_tc, 0); + + skb->tc_classid = TC_H_MAKE(1, 42); + return TC_ACT_OK; } -__section("classifier") int cls_entry(struct __sk_buff *skb) +__section_cls_entry +int cls_entry(struct __sk_buff *skb) { - bpf_tail_call(skb, &jmp_tc, 0); - return -1; + tail_call(skb, &jmp_tc, 0); + return TC_ACT_SHOT; } -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git
a/examples/bpf/bpf_funcs.h b/examples/bpf/bpf_funcs.h deleted file mode 100644 index 6d058f0e..00000000 --- a/examples/bpf/bpf_funcs.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef __BPF_FUNCS__ -#define __BPF_FUNCS__ - -#include - -#include "../../include/bpf_elf.h" - -/* Misc macros. */ -#ifndef __maybe_unused -# define __maybe_unused __attribute__ ((__unused__)) -#endif - -#ifndef __stringify -# define __stringify(x) #x -#endif - -#ifndef __section -# define __section(NAME) __attribute__((section(NAME), used)) -#endif - -#ifndef __section_tail -# define __section_tail(m, x) __section(__stringify(m) "/" __stringify(x)) -#endif - -#ifndef offsetof -# define offsetof __builtin_offsetof -#endif - -#ifndef htons -# define htons(x) __constant_htons((x)) -#endif - -#ifndef likely -# define likely(x) __builtin_expect(!!(x), 1) -#endif - -#ifndef unlikely -# define unlikely(x) __builtin_expect(!!(x), 0) -#endif - -/* The verifier will translate them to actual function calls. */ -static void *(*bpf_map_lookup_elem)(void *map, void *key) __maybe_unused = - (void *) BPF_FUNC_map_lookup_elem; - -static int (*bpf_map_update_elem)(void *map, void *key, void *value, - unsigned long long flags) __maybe_unused = - (void *) BPF_FUNC_map_update_elem; - -static int (*bpf_map_delete_elem)(void *map, void *key) __maybe_unused = - (void *) BPF_FUNC_map_delete_elem; - -static unsigned int (*get_smp_processor_id)(void) __maybe_unused = - (void *) BPF_FUNC_get_smp_processor_id; - -static unsigned int (*get_prandom_u32)(void) __maybe_unused = - (void *) BPF_FUNC_get_prandom_u32; - -static int (*bpf_printk)(const char *fmt, int fmt_size, ...) __maybe_unused = - (void *) BPF_FUNC_trace_printk; - -static void (*bpf_tail_call)(void *ctx, void *map, int index) __maybe_unused = - (void *) BPF_FUNC_tail_call; - -/* LLVM built-in functions that an eBPF C program may use to emit - * BPF_LD_ABS and BPF_LD_IND instructions. 
- */ -unsigned long long load_byte(void *skb, unsigned long long off) - asm ("llvm.bpf.load.byte"); - -unsigned long long load_half(void *skb, unsigned long long off) - asm ("llvm.bpf.load.half"); - -unsigned long long load_word(void *skb, unsigned long long off) - asm ("llvm.bpf.load.word"); - -#endif /* __BPF_FUNCS__ */ diff --git a/examples/bpf/bpf_graft.c b/examples/bpf/bpf_graft.c index f36d25a2..f48fd028 100644 --- a/examples/bpf/bpf_graft.c +++ b/examples/bpf/bpf_graft.c @@ -1,6 +1,4 @@ -#include - -#include "bpf_funcs.h" +#include "../../include/bpf_api.h" /* This example demonstrates how classifier run-time behaviour * can be altered with tail calls. We start out with an empty @@ -34,37 +32,36 @@ * Socket Thread-19818 [001] ..s. 139022.156730: : bbb * [...] */ -struct bpf_elf_map __section("maps") jmp_tc = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_GLOBAL_NS, - .max_elem = 1, -}; -__section("aaa") int cls_aaa(struct __sk_buff *skb) +BPF_PROG_ARRAY(jmp_tc, 0, PIN_GLOBAL_NS, 1); + +__section("aaa") +int cls_aaa(struct __sk_buff *skb) { char fmt[] = "aaa\n"; - bpf_printk(fmt, sizeof(fmt)); - return -1; + trace_printk(fmt, sizeof(fmt)); + return TC_H_MAKE(1, 42); } -__section("bbb") int cls_bbb(struct __sk_buff *skb) +__section("bbb") +int cls_bbb(struct __sk_buff *skb) { char fmt[] = "bbb\n"; - bpf_printk(fmt, sizeof(fmt)); - return -1; + trace_printk(fmt, sizeof(fmt)); + return TC_H_MAKE(1, 43); } -__section("classifier") int cls_entry(struct __sk_buff *skb) +__section_cls_entry +int cls_entry(struct __sk_buff *skb) { char fmt[] = "fallthrough\n"; - bpf_tail_call(skb, &jmp_tc, 0); - bpf_printk(fmt, sizeof(fmt)); - return -1; + tail_call(skb, &jmp_tc, 0); + trace_printk(fmt, sizeof(fmt)); + + return BPF_H_DEFAULT; } -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c index 009febd0..47280492 100644 --- 
a/examples/bpf/bpf_prog.c +++ b/examples/bpf/bpf_prog.c @@ -168,8 +168,8 @@ /* Common, shared definitions with ebpf_agent.c. */ #include "bpf_shared.h" -/* Selection of BPF helper functions for our example. */ -#include "bpf_funcs.h" +/* BPF helper functions for our example. */ +#include "../../include/bpf_api.h" /* Could be defined here as well, or included from the header. */ #define TC_ACT_UNSPEC (-1) @@ -387,10 +387,10 @@ static inline void cls_update_proto_map(const struct __sk_buff *skb, uint8_t proto = flow->ip_proto; struct count_tuple *ct, _ct; - ct = bpf_map_lookup_elem(&map_proto, &proto); + ct = map_lookup_elem(&map_proto, &proto); if (likely(ct)) { - __sync_fetch_and_add(&ct->packets, 1); - __sync_fetch_and_add(&ct->bytes, skb->len); + lock_xadd(&ct->packets, 1); + lock_xadd(&ct->bytes, skb->len); return; } @@ -398,7 +398,7 @@ static inline void cls_update_proto_map(const struct __sk_buff *skb, _ct.packets = 1; _ct.bytes = skb->len; - bpf_map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); + map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); } static inline void cls_update_queue_map(const struct __sk_buff *skb) @@ -409,11 +409,11 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) mismatch = skb->queue_mapping != get_smp_processor_id(); - cq = bpf_map_lookup_elem(&map_queue, &queue); + cq = map_lookup_elem(&map_queue, &queue); if (likely(cq)) { - __sync_fetch_and_add(&cq->total, 1); + lock_xadd(&cq->total, 1); if (mismatch) - __sync_fetch_and_add(&cq->mismatch, 1); + lock_xadd(&cq->mismatch, 1); return; } @@ -421,7 +421,7 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) _cq.total = 1; _cq.mismatch = mismatch ? 
1 : 0; - bpf_map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); + map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); } /* eBPF program definitions, placed in various sections, which can @@ -439,7 +439,8 @@ static inline void cls_update_queue_map(const struct __sk_buff *skb) * It is however not required to have multiple programs sharing * a file. */ -__section("classifier") int cls_main(struct __sk_buff *skb) +__section("classifier") +int cls_main(struct __sk_buff *skb) { struct flow_keys flow; @@ -456,13 +457,14 @@ static inline void act_update_drop_map(void) { uint32_t *count, cpu = get_smp_processor_id(); - count = bpf_map_lookup_elem(&map_drops, &cpu); + count = map_lookup_elem(&map_drops, &cpu); if (count) /* Only this cpu is accessing this element. */ (*count)++; } -__section("action-mark") int act_mark_main(struct __sk_buff *skb) +__section("action-mark") +int act_mark_main(struct __sk_buff *skb) { /* You could also mangle skb data here with the helper function * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could @@ -479,7 +481,8 @@ __section("action-mark") int act_mark_main(struct __sk_buff *skb) return TC_ACT_UNSPEC; } -__section("action-rand") int act_rand_main(struct __sk_buff *skb) +__section("action-rand") +int act_rand_main(struct __sk_buff *skb) { /* Sorry, we're near event horizon ... */ if ((get_prandom_u32() & 3) == 0) { @@ -493,4 +496,4 @@ __section("action-rand") int act_rand_main(struct __sk_buff *skb) /* Last but not least, the file contains a license. Some future helper * functions may only be available with a GPL license. 
*/ -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_shared.c b/examples/bpf/bpf_shared.c index a8dc39c7..accc0adf 100644 --- a/examples/bpf/bpf_shared.c +++ b/examples/bpf/bpf_shared.c @@ -1,6 +1,4 @@ -#include - -#include "bpf_funcs.h" +#include "../../include/bpf_api.h" /* Minimal, stand-alone toy map pinning example: * @@ -20,35 +18,31 @@ * instance is being created. */ -struct bpf_elf_map __section("maps") map_sh = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_OBJECT_NS, /* or PIN_GLOBAL_NS, or PIN_NONE */ - .max_elem = 1, -}; +BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1); /* or PIN_GLOBAL_NS, or PIN_NONE */ -__section("egress") int emain(struct __sk_buff *skb) +__section("egress") +int emain(struct __sk_buff *skb) { int key = 0, *val; - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) - __sync_fetch_and_add(val, 1); + lock_xadd(val, 1); - return -1; + return BPF_H_DEFAULT; } -__section("ingress") int imain(struct __sk_buff *skb) +__section("ingress") +int imain(struct __sk_buff *skb) { char fmt[] = "map val: %d\n"; int key = 0, *val; - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) - bpf_printk(fmt, sizeof(fmt), *val); + trace_printk(fmt, sizeof(fmt), *val); - return -1; + return BPF_H_DEFAULT; } -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h index ea8f0147..a24038dd 100644 --- a/examples/bpf/bpf_shared.h +++ b/examples/bpf/bpf_shared.h @@ -10,7 +10,7 @@ enum { }; struct count_tuple { - long packets; /* type long for __sync_fetch_and_add() */ + long packets; /* type long for lock_xadd() */ long bytes; }; diff --git a/examples/bpf/bpf_tailcall.c b/examples/bpf/bpf_tailcall.c index f186e575..040790d0 100644 --- a/examples/bpf/bpf_tailcall.c +++ b/examples/bpf/bpf_tailcall.c @@ -1,6 +1,4 
@@ -#include - -#include "bpf_funcs.h" +#include "../../include/bpf_api.h" #define ENTRY_INIT 3 #define ENTRY_0 0 @@ -27,89 +25,75 @@ * program array can be atomically replaced during run-time, e.g. to change * classifier behaviour. */ -struct bpf_elf_map __section("maps") map_sh = { - .type = BPF_MAP_TYPE_ARRAY, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_OBJECT_NS, - .max_elem = 1, -}; -struct bpf_elf_map __section("maps") jmp_tc = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .id = FOO, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_OBJECT_NS, - .max_elem = MAX_JMP_SIZE, -}; +BPF_PROG_ARRAY(jmp_tc, FOO, PIN_OBJECT_NS, MAX_JMP_SIZE); +BPF_PROG_ARRAY(jmp_ex, BAR, PIN_OBJECT_NS, 1); -struct bpf_elf_map __section("maps") jmp_ex = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .id = BAR, - .size_key = sizeof(int), - .size_value = sizeof(int), - .pinning = PIN_OBJECT_NS, - .max_elem = 1, -}; +BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1); -__section_tail(FOO, ENTRY_0) int cls_case1(struct __sk_buff *skb) +__section_tail(FOO, ENTRY_0) +int cls_case1(struct __sk_buff *skb) { char fmt[] = "case1: map-val: %d from:%u\n"; int key = 0, *val; - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) - bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); skb->cb[0] = ENTRY_0; - bpf_tail_call(skb, &jmp_ex, ENTRY_0); - return 0; + tail_call(skb, &jmp_ex, ENTRY_0); + + return BPF_H_DEFAULT; } -__section_tail(FOO, ENTRY_1) int cls_case2(struct __sk_buff *skb) +__section_tail(FOO, ENTRY_1) +int cls_case2(struct __sk_buff *skb) { char fmt[] = "case2: map-val: %d from:%u\n"; int key = 0, *val; - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) - bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); skb->cb[0] = ENTRY_1; - bpf_tail_call(skb, &jmp_tc, ENTRY_0); - return 0; + 
tail_call(skb, &jmp_tc, ENTRY_0); + + return BPF_H_DEFAULT; } -__section_tail(BAR, ENTRY_0) int cls_exit(struct __sk_buff *skb) +__section_tail(BAR, ENTRY_0) +int cls_exit(struct __sk_buff *skb) { char fmt[] = "exit: map-val: %d from:%u\n"; int key = 0, *val; - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) - bpf_printk(fmt, sizeof(fmt), *val, skb->cb[0]); + trace_printk(fmt, sizeof(fmt), *val, skb->cb[0]); /* Termination point. */ - return -1; + return BPF_H_DEFAULT; } -__section("classifier") int cls_entry(struct __sk_buff *skb) +__section_cls_entry +int cls_entry(struct __sk_buff *skb) { char fmt[] = "fallthrough\n"; int key = 0, *val; /* For transferring state, we can use skb->cb[0] ... skb->cb[4]. */ - val = bpf_map_lookup_elem(&map_sh, &key); + val = map_lookup_elem(&map_sh, &key); if (val) { - __sync_fetch_and_add(val, 1); + lock_xadd(val, 1); skb->cb[0] = ENTRY_INIT; - bpf_tail_call(skb, &jmp_tc, skb->hash & (MAX_JMP_SIZE - 1)); + tail_call(skb, &jmp_tc, skb->hash & (MAX_JMP_SIZE - 1)); } - bpf_printk(fmt, sizeof(fmt)); - return 0; + trace_printk(fmt, sizeof(fmt)); + return BPF_H_DEFAULT; } -char __license[] __section("license") = "GPL"; +BPF_LICENSE("GPL"); diff --git a/include/bpf_api.h b/include/bpf_api.h new file mode 100644 index 00000000..8503b9a5 --- /dev/null +++ b/include/bpf_api.h @@ -0,0 +1,225 @@ +#ifndef __BPF_API__ +#define __BPF_API__ + +/* Note: + * + * This file can be included into eBPF kernel programs. It contains + * a couple of useful helper functions, map/section ABI (bpf_elf.h), + * misc macros and some eBPF specific LLVM built-ins. + */ + +#include + +#include +#include +#include + +#include + +#include "bpf_elf.h" + +/** Misc macros. 
*/ + +#ifndef __stringify +# define __stringify(X) #X +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((__unused__)) +#endif + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + +#ifndef likely +# define likely(X) __builtin_expect(!!(X), 1) +#endif + +#ifndef unlikely +# define unlikely(X) __builtin_expect(!!(X), 0) +#endif + +#ifndef htons +# define htons(X) __constant_htons((X)) +#endif + +#ifndef ntohs +# define ntohs(X) __constant_ntohs((X)) +#endif + +#ifndef htonl +# define htonl(X) __constant_htonl((X)) +#endif + +#ifndef ntohl +# define ntohl(X) __constant_ntohl((X) +#endif + +/** Section helper macros. */ + +#ifndef __section +# define __section(NAME) \ + __attribute__((section(NAME), used)) +#endif + +#ifndef __section_tail +# define __section_tail(ID, KEY) \ + __section(__stringify(ID) "/" __stringify(KEY)) +#endif + +#ifndef __section_cls_entry +# define __section_cls_entry \ + __section(ELF_SECTION_CLASSIFIER) +#endif + +#ifndef __section_act_entry +# define __section_act_entry \ + __section(ELF_SECTION_ACTION) +#endif + +#ifndef __section_license +# define __section_license \ + __section(ELF_SECTION_LICENSE) +#endif + +#ifndef __section_maps +# define __section_maps \ + __section(ELF_SECTION_MAPS) +#endif + +/** Declaration helper macros. 
*/ + +#ifndef BPF_LICENSE +# define BPF_LICENSE(NAME) \ + char ____license[] __section_license = NAME +#endif + +#ifndef __BPF_MAP +# define __BPF_MAP(NAME, TYPE, ID, SIZE_KEY, SIZE_VALUE, PIN, MAX_ELEM) \ + struct bpf_elf_map __section_maps NAME = { \ + .type = (TYPE), \ + .id = (ID), \ + .size_key = (SIZE_KEY), \ + .size_value = (SIZE_VALUE), \ + .pinning = (PIN), \ + .max_elem = (MAX_ELEM), \ + } +#endif + +#ifndef BPF_HASH +# define BPF_HASH(NAME, ID, SIZE_KEY, SIZE_VALUE, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_HASH, ID, SIZE_KEY, SIZE_VALUE, \ + PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY +# define BPF_ARRAY(NAME, ID, SIZE_VALUE, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_ARRAY, ID, sizeof(uint32_t), \ + SIZE_VALUE, PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY2 +# define BPF_ARRAY2(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint16_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY4 +# define BPF_ARRAY4(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint32_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_ARRAY8 +# define BPF_ARRAY8(NAME, ID, PIN, MAX_ELEM) \ + BPF_ARRAY(NAME, ID, sizeof(uint64_t), PIN, MAX_ELEM) +#endif + +#ifndef BPF_PROG_ARRAY +# define BPF_PROG_ARRAY(NAME, ID, PIN, MAX_ELEM) \ + __BPF_MAP(NAME, BPF_MAP_TYPE_PROG_ARRAY, ID, sizeof(uint32_t), \ + sizeof(uint32_t), PIN, MAX_ELEM) +#endif + +/** Classifier helper */ + +#ifndef BPF_H_DEFAULT +# define BPF_H_DEFAULT -1 +#endif + +/** BPF helper functions for tc. */ + +#ifndef BPF_FUNC +# define BPF_FUNC(NAME, ...) 
\ + (* NAME)(__VA_ARGS__) __maybe_unused = (void *) BPF_FUNC_##NAME +#endif + +/* Map access/manipulation */ +static void *BPF_FUNC(map_lookup_elem, void *map, const void *key); +static int BPF_FUNC(map_update_elem, void *map, const void *key, + const void *value, uint32_t flags); +static int BPF_FUNC(map_delete_elem, void *map, const void *key); + +/* Time access */ +static uint64_t BPF_FUNC(ktime_get_ns); + +/* Debugging */ +static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...); + +/* Random numbers */ +static uint32_t BPF_FUNC(get_prandom_u32); + +/* Tail calls */ +static void BPF_FUNC(tail_call, struct __sk_buff *skb, void *map, + uint32_t index); + +/* System helpers */ +static uint32_t BPF_FUNC(get_smp_processor_id); + +/* Packet misc meta data */ +static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb); +static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb); + +/* Packet redirection */ +static int BPF_FUNC(redirect, int ifindex, uint32_t flags); +static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex, + uint32_t flags); + +/* Packet manipulation */ +#define BPF_PSEUDO_HDR 0x10 +#define BPF_HAS_PSEUDO_HDR(flags) ((flags) & BPF_PSEUDO_HDR) +#define BPF_HDR_FIELD_SIZE(flags) ((flags) & 0x0f) + +static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off, + void *from, uint32_t len, uint32_t flags); +static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off, + uint32_t from, uint32_t to, uint32_t flags); +static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off, + uint32_t from, uint32_t to, uint32_t flags); + +/* Packet vlan encap/decap */ +static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto, + uint16_t vlan_tci); +static int BPF_FUNC(skb_vlan_pop, struct __sk_buff *skb); + +/* Packet tunnel encap/decap */ +static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb, + struct bpf_tunnel_key *to, uint32_t size, uint32_t flags); +static int 
BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb, + struct bpf_tunnel_key *from, uint32_t size, uint32_t flags); + +/** LLVM built-ins */ + +#ifndef lock_xadd +# define lock_xadd(ptr, val) ((void) __sync_fetch_and_add(ptr, val)) +#endif + +unsigned long long load_byte(void *skb, unsigned long long off) + asm ("llvm.bpf.load.byte"); + +unsigned long long load_half(void *skb, unsigned long long off) + asm ("llvm.bpf.load.half"); + +unsigned long long load_word(void *skb, unsigned long long off) + asm ("llvm.bpf.load.word"); + +#endif /* __BPF_API__ */ From 5866bddd9aa9eba57623d57f866afaee9a5e2597 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Nov 2015 14:57:28 -0800 Subject: [PATCH 108/151] ila: Add support for ILA lwtunnels This patch: - Adds a utility function for parsing a 64 bit address - Adds a utility function for converting a 64 bit address to ASCII - Adds an ILA encap type in lwt tunnels Signed-off-by: Tom Herbert --- include/linux/ila.h | 15 +++++++++++ include/utils.h | 4 +++ ip/iproute_lwtunnel.c | 45 +++++++++++++++++++++++++++++++++ lib/utils.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 include/linux/ila.h diff --git a/include/linux/ila.h b/include/linux/ila.h new file mode 100644 index 00000000..f08e8d77 --- /dev/null +++ b/include/linux/ila.h @@ -0,0 +1,15 @@ +/* ila.h - ILA Interface */ + +#ifndef _LINUX_ILA_H +#define _LINUX_ILA_H + +enum { + ILA_ATTR_UNSPEC, + ILA_ATTR_LOCATOR, /* u64 */ + + __ILA_ATTR_MAX, +}; + +#define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) + +#endif /* _LINUX_ILA_H */ diff --git a/include/utils.h b/include/utils.h index 1d351490..cc821e80 100644 --- a/include/utils.h +++ b/include/utils.h @@ -93,6 +93,7 @@ int get_prefix_1(inet_prefix *dst, char *arg, int family); int get_addr(inet_prefix *dst, const char *arg, int family); int get_prefix(inet_prefix *dst, char *arg, int family); int mask2bits(__u32 netmask); +int get_addr_ila(__u64 *val, const char *arg); int
get_integer(int *val, const char *arg, int base); int get_unsigned(unsigned *val, const char *arg, int base); @@ -107,9 +108,12 @@ int get_u16(__u16 *val, const char *arg, int base); int get_s16(__s16 *val, const char *arg, int base); int get_u8(__u8 *val, const char *arg, int base); int get_s8(__s8 *val, const char *arg, int base); +int get_addr64(__u64 *ap, const char *cp); char* hexstring_n2a(const __u8 *str, int len, char *buf, int blen); __u8* hexstring_a2n(const char *str, __u8 *buf, int blen); +#define ADDR64_BUF_SIZE sizeof("xxxx:xxxx:xxxx:xxxx") +int addr64_n2a(__u64 addr, char *buff, size_t len); int af_bit_len(int af); int af_byte_len(int af); diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 63322a18..b2764a6c 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,8 @@ static int read_encap_type(const char *name) return LWTUNNEL_ENCAP_IP; else if (strcmp(name, "ip6") == 0) return LWTUNNEL_ENCAP_IP6; + else if (strcmp(name, "ila") == 0) + return LWTUNNEL_ENCAP_ILA; else return LWTUNNEL_ENCAP_NONE; } @@ -45,6 +48,8 @@ static const char *format_encap_type(int type) return "ip"; case LWTUNNEL_ENCAP_IP6: return "ip6"; + case LWTUNNEL_ENCAP_ILA: + return "ila"; default: return "unknown"; } @@ -95,6 +100,21 @@ static void print_encap_ip(FILE *fp, struct rtattr *encap) fprintf(fp, "tos %d ", rta_getattr_u8(tb[LWTUNNEL_IP_TOS])); } +static void print_encap_ila(FILE *fp, struct rtattr *encap) +{ + struct rtattr *tb[ILA_ATTR_MAX+1]; + + parse_rtattr_nested(tb, ILA_ATTR_MAX, encap); + + if (tb[ILA_ATTR_LOCATOR]) { + char abuf[ADDR64_BUF_SIZE]; + + addr64_n2a(*(__u64 *)RTA_DATA(tb[ILA_ATTR_LOCATOR]), + abuf, sizeof(abuf)); + fprintf(fp, " %s ", abuf); + } +} + void lwt_print_encap(FILE *fp, struct rtattr *encap_type, struct rtattr *encap) { @@ -114,6 +134,9 @@ void lwt_print_encap(FILE *fp, struct rtattr *encap_type, case LWTUNNEL_ENCAP_IP: 
print_encap_ip(fp, encap); break; + case LWTUNNEL_ENCAP_ILA: + print_encap_ila(fp, encap); + break; } } @@ -186,6 +209,25 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***ar return 0; } +static int parse_encap_ila(struct rtattr *rta, size_t len, + int *argcp, char ***argvp) +{ + __u64 locator; + int argc = *argcp; + char **argv = *argvp; + + if (get_addr64(&locator, *argv) < 0) { + fprintf(stderr, "Bad locator: %s\n", *argv); + exit(1); + } + + rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator); + + *argcp = argc; + *argvp = argv; + + return 0; +} int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { @@ -213,6 +255,9 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp) case LWTUNNEL_ENCAP_IP: parse_encap_ip(rta, len, &argc, &argv); break; + case LWTUNNEL_ENCAP_ILA: + parse_encap_ila(rta, len, &argc, &argv); + break; default: fprintf(stderr, "Error: unsupported encap type\n"); break; diff --git a/lib/utils.c b/lib/utils.c index 939a44f0..fa35f4d0 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -384,6 +384,41 @@ static int get_addr_ipv4(__u8 *ap, const char *cp) return 1; } +int get_addr64(__u64 *ap, const char *cp) +{ + int i; + + union { + __u16 v16[4]; + __u64 v64; + } val; + + for (i = 0; i < 4; i++) { + unsigned long n; + char *endp; + + n = strtoul(cp, &endp, 16); + if (n > 0xffff) + return -1; /* bogus network value */ + + if (endp == cp) /* no digits */ + return -1; + + val.v16[i] = htons(n); + + if (*endp == '\0') + break; + + if (i == 3 || *endp != ':') + return -1; /* extra characters */ + cp = endp + 1; + } + + *ap = val.v64; + + return 1; +} + int get_addr_1(inet_prefix *addr, const char *name, int family) { memset(addr, 0, sizeof(*addr)); @@ -838,6 +873,30 @@ __u8* hexstring_a2n(const char *str, __u8 *buf, int blen) return buf; } +int addr64_n2a(__u64 addr, char *buff, size_t len) +{ + __u16 *words = (__u16 *)&addr; + __u16 v; + int i, ret; + size_t written = 0; + 
char *sep = ":"; + + for (i = 0; i < 4; i++) { + v = ntohs(words[i]); + + if (i == 3) + sep = ""; + + ret = snprintf(&buff[written], len - written, "%x%s", v, sep); + if (ret < 0) + return ret; + + written += ret; + } + + return written; +} + int print_timestamp(FILE *fp) { struct timeval tv; From e834eb8ebaf7b0ceb504f3dc4adf5c99f38ee93c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 1 Dec 2015 01:17:06 +0300 Subject: [PATCH 109/151] ip neigh: device is optional for proxy entries Though dumping such entries crashes present kernels. Signed-off-by: Konstantin Khlebnikov --- ip/ipneigh.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 54655842..92b7cd6f 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -100,8 +100,9 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) struct ndmsg ndm; char buf[256]; } req; - char *d = NULL; + char *dev = NULL; int dst_ok = 0; + int dev_ok = 0; int lladdr_ok = 0; char * lla = NULL; inet_prefix dst; @@ -135,10 +136,12 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) duparg("address", *argv); get_addr(&dst, *argv, preferred_family); dst_ok = 1; + dev_ok = 1; req.ndm.ndm_flags |= NTF_PROXY; } else if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); - d = *argv; + dev = *argv; + dev_ok = 1; } else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); @@ -153,7 +156,7 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) } argc--; argv++; } - if (d == NULL || !dst_ok || dst.family == AF_UNSPEC) { + if (!dev_ok || !dst_ok || dst.family == AF_UNSPEC) { fprintf(stderr, "Device and destination are required arguments.\n"); exit(-1); } @@ -175,8 +178,8 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) ll_init_map(&rth); - if ((req.ndm.ndm_ifindex = ll_name_to_index(d)) == 0) { - fprintf(stderr, "Cannot find device \"%s\"\n", d); + if (dev && (req.ndm.ndm_ifindex = ll_name_to_index(dev)) == 0) { + 
fprintf(stderr, "Cannot find device \"%s\"\n", dev); return -1; } From ed6b8652f7d5470cac7fd763b4a47d07a3a0bfb6 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 12 Dec 2015 14:09:48 +0100 Subject: [PATCH 110/151] route: Fix printing of locked entries Commit 0f7543322c5fd ("route: ignore RTAX_HOPLIMIT of value -1") accidentally reordered fprintf statements. This patch restores the original ordering. Fixes: 0f7543322c5fd ("route: ignore RTAX_HOPLIMIT of value -1") Signed-off-by: Phil Sutter --- ip/iproute.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index c42ea0b9..4d86a596 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -577,8 +577,6 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) if (mxrta[i] == NULL) continue; - if (mxlock & (1< Date: Tue, 15 Dec 2015 12:18:04 +0100 Subject: [PATCH 111/151] lwtunnel: fix argument parsing Currently parse_encap_ip() does not update correctly argv/argc; if multiple lwtunnel arguments are provided, the parsing fails after the first one, i.e. ip route add 172.16.101.0/24 dev vxlan1 encap ip id 42 dst 192.168.255.1 fails with: Error: either "to" is duplicate, or "dst" is a garbage. This commit addresses the issue, stepping to next argument at each iteration of the parsing loop. 
Fixes: 1e5293056a02 ("lwtunnel: Add encapsulation support to ip route") Signed-off-by: Paolo Abeni --- ip/iproute_lwtunnel.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index b2764a6c..1243977c 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -201,10 +201,14 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***ar } else { break; } + argc--; argv++; } - *argcp = argc; - *argvp = argv; + /* argv is currently the first unparsed argument, + * but the lwt_parse_encap() caller will move to the next, + * so step back */ + *argcp = argc + 1; + *argvp = argv - 1; return 0; } From 741c20b024f5f51bf194435fc4d79f34ae5c5481 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 17 Dec 2015 17:21:53 -0800 Subject: [PATCH 112/151] include: update kernel headers Current headers for net-next --- include/linux/if_link.h | 3 +++ include/linux/in6.h | 1 + include/linux/rtnetlink.h | 1 + include/linux/sock_diag.h | 1 + 4 files changed, 6 insertions(+) diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 5d206c71..c9ad487d 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -460,6 +460,9 @@ enum { IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) diff --git a/include/linux/in6.h b/include/linux/in6.h index 994f4c22..aa5b66df 100644 --- a/include/linux/in6.h +++ b/include/linux/in6.h @@ -196,6 +196,7 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 +#define IPV6_HDRINCL 36 #endif /* diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 18c543a5..6aaa2a3e 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -311,6 +311,7 @@ enum rtattr_type_t { RTA_PREF, 
RTA_ENCAP_TYPE, RTA_ENCAP, + RTA_EXPIRES, __RTA_MAX }; diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 024e1f4c..dafcb891 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -4,6 +4,7 @@ #include #define SOCK_DIAG_BY_FAMILY 20 +#define SOCK_DESTROY 21 struct sock_diag_req { __u8 sdiag_family; From fd7f9c7fd11fa926bda2edc8bc492e7515753a32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Dec 2015 16:57:32 +0100 Subject: [PATCH 113/151] bpf: minor fix in api and bpf_dump_error() usage Fix a whitespace in bpf_dump_error() usage, and also a missing closing bracket in ntohl() macro for eBPF programs. Signed-off-by: Daniel Borkmann --- include/bpf_api.h | 2 +- tc/tc_bpf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/bpf_api.h b/include/bpf_api.h index 8503b9a5..0666a312 100644 --- a/include/bpf_api.h +++ b/include/bpf_api.h @@ -53,7 +53,7 @@ #endif #ifndef ntohl -# define ntohl(X) __constant_ntohl((X) +# define ntohl(X) __constant_ntohl((X)) #endif /** Section helper macros. */ diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index beb74be6..f9b2b007 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -1042,7 +1042,7 @@ static int bpf_prog_attach(const char *section, "license:\'%s\') %s%s (%d)!\n\n", section, prog->type, prog->size / sizeof(struct bpf_insn), - prog->license, fd < 0 ? "rejected :" : + prog->license, fd < 0 ? "rejected: " : "loaded", fd < 0 ? strerror(errno) : "", fd < 0 ? 
errno : fd); } From 5c5176ce4b014ddf12526ec08fc24f9eede18767 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 16 Dec 2015 10:52:36 +0100 Subject: [PATCH 114/151] iproute: print addrgenmode stable_secret and fallback otherwise Signed-off-by: Hannes Frederic Sowa --- ip/ipaddress.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index bc8359eb..a495a391 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -285,13 +285,20 @@ static void print_af_spec(FILE *fp, struct rtattr *af_spec_attr) parse_rtattr_nested(tb, IFLA_INET6_MAX, inet6_attr); if (tb[IFLA_INET6_ADDR_GEN_MODE]) { - switch (rta_getattr_u8(tb[IFLA_INET6_ADDR_GEN_MODE])) { + __u8 mode = rta_getattr_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + switch (mode) { case IN6_ADDR_GEN_MODE_EUI64: fprintf(fp, "addrgenmode eui64 "); break; case IN6_ADDR_GEN_MODE_NONE: fprintf(fp, "addrgenmode none "); break; + case IN6_ADDR_GEN_MODE_STABLE_PRIVACY: + fprintf(fp, "addrgenmode stable_secret "); + break; + default: + fprintf(fp, "addrgenmode %#.2hhx ", mode); + break; } } } From e79c327edddba0f1f70528ab4cf8ce37227054a6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Dec 2015 10:50:36 +0100 Subject: [PATCH 115/151] vxlan: add support for collect metadata flag This patch adds support for IFLA_VXLAN_COLLECT_METADATA via the 'external' keyword to the vxlan link. Also enforce mutual exclusion between 'vni' and 'external'.
Signed-off-by: Paolo Abeni --- ip/iplink_vxlan.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index db29bf03..aa4d5198 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -31,7 +31,7 @@ static void print_explain(FILE *f) fprintf(f, " [ ageing SECONDS ] [ maxaddress NUMBER ]\n"); fprintf(f, " [ [no]udpcsum ] [ [no]udp6zerocsumtx ] [ [no]udp6zerocsumrx ]\n"); fprintf(f, " [ [no]remcsumtx ] [ [no]remcsumrx ]\n"); - fprintf(f, " [ gbp ]\n"); + fprintf(f, " [ [no]external ] [ gbp ]\n"); fprintf(f, "\n"); fprintf(f, "Where: VNI := 0-16777215\n"); fprintf(f, " ADDR := { IP_ADDRESS | any }\n"); @@ -72,6 +72,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, __u8 udp6zerocsumrx = 0; __u8 remcsumtx = 0; __u8 remcsumrx = 0; + __u8 metadata = 0; __u8 gbp = 0; int dst_port_set = 0; struct ifla_vxlan_port_range range = { 0, 0 }; @@ -210,6 +211,10 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, remcsumrx = 1; } else if (!matches(*argv, "noremcsumrx")) { remcsumrx = 0; + } else if (!matches(*argv, "external")) { + metadata = 1; + } else if (!matches(*argv, "noexternal")) { + metadata = 0; } else if (!matches(*argv, "gbp")) { gbp = 1; } else if (matches(*argv, "help") == 0) { @@ -223,7 +228,12 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, argc--, argv++; } - if (!vni_set) { + if (metadata && vni_set) { + fprintf(stderr, "vxlan: both 'external' and vni cannot be specified\n"); + return -1; + } + + if (!metadata && !vni_set) { fprintf(stderr, "vxlan: missing virtual network identifier\n"); return -1; } @@ -272,6 +282,7 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, addattr8(n, 1024, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, udp6zerocsumrx); addattr8(n, 1024, IFLA_VXLAN_REMCSUM_TX, remcsumtx); addattr8(n, 1024, IFLA_VXLAN_REMCSUM_RX, remcsumrx); + addattr8(n, 1024, IFLA_VXLAN_COLLECT_METADATA, metadata); 
if (noage) addattr32(n, 1024, IFLA_VXLAN_AGEING, 0); @@ -428,6 +439,10 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) rta_getattr_u8(tb[IFLA_VXLAN_REMCSUM_RX])) fputs("remcsumrx ", f); + if (tb[IFLA_VXLAN_COLLECT_METADATA] && + rta_getattr_u8(tb[IFLA_VXLAN_COLLECT_METADATA])) + fputs("external ", f); + if (tb[IFLA_VXLAN_GBP]) fputs("gbp ", f); } From 926b39e1feffdacff52fe8b7eafe0ba3b8c9ff59 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Dec 2015 10:50:37 +0100 Subject: [PATCH 116/151] gre: add support for collect metadata flag This patch adds support for IFLA_GRE_COLLECT_METADATA via the 'external' keyword to the gre link. Signed-off-by: Paolo Abeni --- ip/link_gre.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ip/link_gre.c b/ip/link_gre.c index 58f416ca..c85741f5 100644 --- a/ip/link_gre.c +++ b/ip/link_gre.c @@ -74,6 +74,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv, __u16 encapflags = 0; __u16 encapsport = 0; __u16 encapdport = 0; + __u8 metadata = 0; if (!(n->nlmsg_flags & NLM_F_CREATE)) { memset(&req, 0, sizeof(req)); @@ -148,6 +149,9 @@ get_failed: encapsport = rta_getattr_u16(greinfo[IFLA_GRE_ENCAP_SPORT]); if (greinfo[IFLA_GRE_ENCAP_DPORT]) encapdport = rta_getattr_u16(greinfo[IFLA_GRE_ENCAP_DPORT]); + + if (greinfo[IFLA_GRE_COLLECT_METADATA]) + metadata = 1; } while (argc > 0) { @@ -291,6 +295,8 @@ get_failed: encapflags |= TUNNEL_ENCAP_FLAG_REMCSUM; } else if (strcmp(*argv, "noencap-remcsum") == 0) { encapflags |= ~TUNNEL_ENCAP_FLAG_REMCSUM; + } else if (strcmp(*argv, "external") == 0) { + metadata = 1; } else usage(); argc--; argv++; @@ -325,6 +331,8 @@ get_failed: addattr16(n, 1024, IFLA_GRE_ENCAP_FLAGS, encapflags); addattr16(n, 1024, IFLA_GRE_ENCAP_SPORT, htons(encapsport)); addattr16(n, 1024, IFLA_GRE_ENCAP_DPORT, htons(encapdport)); + if (metadata) + addattr_l(n, 1024, IFLA_GRE_COLLECT_METADATA, NULL, 0); return 0; } @@ -413,6 +421,9 @@ static void
gre_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) if (oflags & GRE_CSUM) fputs("ocsum ", f); + if (tb[IFLA_GRE_COLLECT_METADATA]) + fputs("external ", f); + if (tb[IFLA_GRE_ENCAP_TYPE] && *(__u16 *)RTA_DATA(tb[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) { __u16 type = rta_getattr_u16(tb[IFLA_GRE_ENCAP_TYPE]); From d95cdcf52b4c85c280e3a0aceff22238947d92c2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Dec 2015 10:50:38 +0100 Subject: [PATCH 117/151] lwtunnel: implement support for ip6 encap Currently ip6 encap support for lwtunnel is missing. This patch implements it, mostly duplicating the ipv4 parts. Also be sure to insert a space after the encap type, when showing lwtunnel, to avoid the tunnel type and the following argument being merged into a single word. Signed-off-by: Paolo Abeni --- ip/iproute_lwtunnel.c | 92 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 1243977c..70749063 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -115,6 +115,37 @@ static void print_encap_ila(FILE *fp, struct rtattr *encap) } } +static void print_encap_ip6(FILE *fp, struct rtattr *encap) +{ + struct rtattr *tb[LWTUNNEL_IP6_MAX+1]; + char abuf[256]; + + parse_rtattr_nested(tb, LWTUNNEL_IP6_MAX, encap); + + if (tb[LWTUNNEL_IP6_ID]) + fprintf(fp, "id %llu ", ntohll(rta_getattr_u64(tb[LWTUNNEL_IP6_ID]))); + + if (tb[LWTUNNEL_IP6_SRC]) + fprintf(fp, "src %s ", + rt_addr_n2a(AF_INET6, + RTA_PAYLOAD(tb[LWTUNNEL_IP6_SRC]), + RTA_DATA(tb[LWTUNNEL_IP6_SRC]), + abuf, sizeof(abuf))); + + if (tb[LWTUNNEL_IP6_DST]) + fprintf(fp, "dst %s ", + rt_addr_n2a(AF_INET6, + RTA_PAYLOAD(tb[LWTUNNEL_IP6_DST]), + RTA_DATA(tb[LWTUNNEL_IP6_DST]), + abuf, sizeof(abuf))); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + fprintf(fp, "hoplimit %d ", rta_getattr_u8(tb[LWTUNNEL_IP6_HOPLIMIT])); + + if (tb[LWTUNNEL_IP6_TC]) + fprintf(fp, "tc %d ",
rta_getattr_u8(tb[LWTUNNEL_IP6_TC])); +} + void lwt_print_encap(FILE *fp, struct rtattr *encap_type, struct rtattr *encap) { @@ -125,7 +156,7 @@ void lwt_print_encap(FILE *fp, struct rtattr *encap_type, et = rta_getattr_u16(encap_type); - fprintf(fp, " encap %s", format_encap_type(et)); + fprintf(fp, " encap %s ", format_encap_type(et)); switch (et) { case LWTUNNEL_ENCAP_MPLS: @@ -137,6 +168,9 @@ void lwt_print_encap(FILE *fp, struct rtattr *encap_type, case LWTUNNEL_ENCAP_ILA: print_encap_ila(fp, encap); break; + case LWTUNNEL_ENCAP_IP6: + print_encap_ip6(fp, encap); + break; } } @@ -233,6 +267,59 @@ static int parse_encap_ila(struct rtattr *rta, size_t len, return 0; } +static int parse_encap_ip6(struct rtattr *rta, size_t len, int *argcp, char ***argvp) +{ + int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0; + char **argv = *argvp; + int argc = *argcp; + + while (argc > 0) { + if (strcmp(*argv, "id") == 0) { + __u64 id; + NEXT_ARG(); + if (id_ok++) + duparg2("id", *argv); + if (get_u64(&id, *argv, 0)) + invarg("\"id\" value is invalid\n", *argv); + rta_addattr64(rta, len, LWTUNNEL_IP6_ID, htonll(id)); + } else if (strcmp(*argv, "dst") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (dst_ok++) + duparg2("dst", *argv); + get_addr(&addr, *argv, AF_INET6); + rta_addattr_l(rta, len, LWTUNNEL_IP6_DST, &addr.data, addr.bytelen); + } else if (strcmp(*argv, "tc") == 0) { + __u32 tc; + NEXT_ARG(); + if (tos_ok++) + duparg2("tc", *argv); + if (rtnl_dsfield_a2n(&tc, *argv)) + invarg("\"tc\" value is invalid\n", *argv); + rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc); + } else if (strcmp(*argv, "hoplimit") == 0) { + __u8 hoplimit; + NEXT_ARG(); + if (ttl_ok++) + duparg2("hoplimit", *argv); + if (get_u8(&hoplimit, *argv, 0)) + invarg("\"hoplimit\" value is invalid\n", *argv); + rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit); + } else { + break; + } + argc--; argv++; + } + + /* argv is currently the first unparsed argument, + * but the lwt_parse_encap() caller will move 
to the next, + * so step back */ + *argcp = argc + 1; + *argvp = argv - 1; + + return 0; +} + int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { struct rtattr *nest; @@ -262,6 +349,9 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp) case LWTUNNEL_ENCAP_ILA: parse_encap_ila(rta, len, &argc, &argv); break; + case LWTUNNEL_ENCAP_IP6: + parse_encap_ip6(rta, len, &argc, &argv); + break; default: fprintf(stderr, "Error: unsupported encap type\n"); break; From 7d6aadcd0a1dc795d72e1ab311aee333c763fe71 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 18 Dec 2015 11:58:06 +0100 Subject: [PATCH 118/151] ip{,6}tunnel: have a shared stats parser/printer This has a slight side-effect of not aborting when /proc/net/dev is malformed, but OTOH stats are not parsed for uninteresting interfaces. Signed-off-by: Phil Sutter --- ip/ip6tunnel.c | 21 ++------------------- ip/iptunnel.c | 21 ++------------------- ip/tunnel.c | 28 ++++++++++++++++++++++++++++ ip/tunnel.h | 1 + 4 files changed, 33 insertions(+), 38 deletions(-) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 1737d884..7a3cd046 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -341,10 +341,6 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) while (fgets(buf, sizeof(buf), fp) != NULL) { char name[IFNAMSIZ]; int index, type; - unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, - rx_fifo, rx_frame, - tx_bytes, tx_packets, tx_errs, tx_drops, - tx_fifo, tx_colls, tx_carrier, rx_multi; struct ip6_tnl_parm2 p1; char *ptr; @@ -354,12 +350,6 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) fprintf(stderr, "Wrong format for /proc/net/dev. 
Giving up.\n"); goto end; } - if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", - &rx_bytes, &rx_packets, &rx_errs, &rx_drops, - &rx_fifo, &rx_frame, &rx_multi, - &tx_bytes, &tx_packets, &tx_errs, &tx_drops, - &tx_fifo, &tx_colls, &tx_carrier) != 14) - continue; if (p->name[0] && strcmp(p->name, name)) continue; index = ll_name_to_index(name); @@ -385,15 +375,8 @@ static int do_tunnels_list(struct ip6_tnl_parm2 *p) if (!ip6_tnl_parm_match(p, &p1)) continue; print_tunnel(&p1); - if (show_stats) { - printf("%s", _SL_); - printf("RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts%s", _SL_); - printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-8ld%s", - rx_packets, rx_bytes, rx_errs, rx_frame, rx_fifo, rx_multi, _SL_); - printf("TX: Packets Bytes Errors DeadLoop NoRoute NoBufs%s", _SL_); - printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-6ld", - tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops); - } + if (show_stats) + tnl_print_stats(ptr); printf("\n"); } err = 0; diff --git a/ip/iptunnel.c b/ip/iptunnel.c index a3ff99bd..65a4e6e9 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -405,10 +405,6 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) while (fgets(buf, sizeof(buf), fp) != NULL) { char name[IFNAMSIZ]; int index, type; - unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, - rx_fifo, rx_frame, - tx_bytes, tx_packets, tx_errs, tx_drops, - tx_fifo, tx_colls, tx_carrier, rx_multi; struct ip_tunnel_parm p1; char *ptr; @@ -419,12 +415,6 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) fprintf(stderr, "Wrong format for /proc/net/dev. 
Giving up.\n"); goto end; } - if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", - &rx_bytes, &rx_packets, &rx_errs, &rx_drops, - &rx_fifo, &rx_frame, &rx_multi, - &tx_bytes, &tx_packets, &tx_errs, &tx_drops, - &tx_fifo, &tx_colls, &tx_carrier) != 14) - continue; if (p->name[0] && strcmp(p->name, name)) continue; index = ll_name_to_index(name); @@ -447,15 +437,8 @@ static int do_tunnels_list(struct ip_tunnel_parm *p) (p->i_key && p1.i_key != p->i_key)) continue; print_tunnel(&p1); - if (show_stats) { - printf("%s", _SL_); - printf("RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts%s", _SL_); - printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-8ld%s", - rx_packets, rx_bytes, rx_errs, rx_frame, rx_fifo, rx_multi, _SL_); - printf("TX: Packets Bytes Errors DeadLoop NoRoute NoBufs%s", _SL_); - printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-6ld", - tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops); - } + if (show_stats) + tnl_print_stats(ptr); printf("\n"); } err = 0; diff --git a/ip/tunnel.c b/ip/tunnel.c index 79f2201f..1dd80922 100644 --- a/ip/tunnel.c +++ b/ip/tunnel.c @@ -195,3 +195,31 @@ __be32 tnl_parse_key(const char *name, const char *key) } return htonl(uval); } + +/* tnl_print_stats - print tunnel statistics + * + * @buf - tunnel interface's line in /proc/net/dev, + * starting past the interface name and following colon + */ +void tnl_print_stats(const char *buf) +{ + unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, + rx_fifo, rx_frame, + tx_bytes, tx_packets, tx_errs, tx_drops, + tx_fifo, tx_colls, tx_carrier, rx_multi; + + if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", + &rx_bytes, &rx_packets, &rx_errs, &rx_drops, + &rx_fifo, &rx_frame, &rx_multi, + &tx_bytes, &tx_packets, &tx_errs, &tx_drops, + &tx_fifo, &tx_colls, &tx_carrier) != 14) + return; + + printf("%s", _SL_); + printf("RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts%s", _SL_); + printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-8ld%s", + rx_packets, rx_bytes, 
rx_errs, rx_frame, rx_fifo, rx_multi, _SL_); + printf("TX: Packets Bytes Errors DeadLoop NoRoute NoBufs%s", _SL_); + printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-6ld", + tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops); +} diff --git a/ip/tunnel.h b/ip/tunnel.h index 9fb4a186..9a03c0d7 100644 --- a/ip/tunnel.h +++ b/ip/tunnel.h @@ -32,5 +32,6 @@ int tnl_prl_ioctl(int cmd, const char *name, void *p); int tnl_6rd_ioctl(int cmd, const char *name, void *p); int tnl_ioctl_get_6rd(const char *name, void *p); __be32 tnl_parse_key(const char *name, const char *key); +void tnl_print_stats(const char *buf); #endif From f8fc1d101e74fc9f1f9ca57b5e494c2f7bc33bf7 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Mon, 21 Dec 2015 20:42:56 +0100 Subject: [PATCH 119/151] iptunnel: Fix compile error in ip/tunnel.c I repeatedly failed to get this right, so now I have to clean up my mess afterwards. Fixes: 7d6aadcd0a1dc ("ip{,6}tunnel: have a shared stats parser/printer") Signed-off-by: Phil Sutter --- ip/tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/tunnel.c b/ip/tunnel.c index 1dd80922..39f825ba 100644 --- a/ip/tunnel.c +++ b/ip/tunnel.c @@ -208,7 +208,7 @@ void tnl_print_stats(const char *buf) tx_bytes, tx_packets, tx_errs, tx_drops, tx_fifo, tx_colls, tx_carrier, rx_multi; - if (sscanf(ptr, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", + if (sscanf(buf, "%lu%lu%lu%lu%lu%lu%lu%*d%lu%lu%lu%lu%lu%lu%lu", &rx_bytes, &rx_packets, &rx_errs, &rx_drops, &rx_fifo, &rx_frame, &rx_multi, &tx_bytes, &tx_packets, &tx_errs, &tx_drops, From 68eede2505005ea919e2fb43afc91bad0601faea Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Dec 2015 16:29:36 +0800 Subject: [PATCH 120/151] route: allow routes to be configured with expire values Signed-off-by: Hangbin Liu --- ip/iproute.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index 4d86a596..d5e3ebe2 100644 --- a/ip/iproute.c +++ 
b/ip/iproute.c @@ -86,7 +86,7 @@ static void usage(void) fprintf(stderr, " [ ssthresh NUMBER ] [ realms REALM ] [ src ADDRESS ]\n"); fprintf(stderr, " [ rto_min TIME ] [ hoplimit NUMBER ] [ initrwnd NUMBER ]\n"); fprintf(stderr, " [ features FEATURES ] [ quickack BOOL ] [ congctl NAME ]\n"); - fprintf(stderr, " [ pref PREF ]\n"); + fprintf(stderr, " [ pref PREF ] [ expires TIME ]\n"); fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n"); fprintf(stderr, " unreachable | prohibit | blackhole | nat ]\n"); fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n"); @@ -829,6 +829,7 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv) int table_ok = 0; int raw = 0; int type_ok = 0; + static int hz; memset(&req, 0, sizeof(req)); @@ -899,6 +900,14 @@ static int iproute_modify(int cmd, unsigned flags, int argc, char **argv) if (rtnl_dsfield_a2n(&tos, *argv)) invarg("\"tos\" value is invalid\n", *argv); req.r.rtm_tos = tos; + } else if (strcmp(*argv, "expires") == 0 ) { + __u32 expires; + NEXT_ARG(); + if (get_u32(&expires, *argv, 0)) + invarg("\"expires\" value is invalid\n", *argv); + if (!hz) + hz = get_user_hz(); + addattr32(&req.n, sizeof(req), RTA_EXPIRES, expires*hz); } else if (matches(*argv, "metric") == 0 || matches(*argv, "priority") == 0 || strcmp(*argv, "preference") == 0) { From 966fe23a7ca85c553bb1a3cc5160f0a6b1409996 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 25 Dec 2015 11:12:15 +0800 Subject: [PATCH 121/151] iproute2: ip-route.8.in: Add missing '[' before 'pref' Signed-off-by: Hangbin Liu --- man/man8/ip-route.8.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 9934a1e8..743d62be 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -130,7 +130,7 @@ replace " } " .B quickack .IR BOOL " ] [ " .B congctl -.IR NAME " ]" +.IR NAME " ] [ " .B pref .IR PREF " ]" From 
3fbe7ca847367d0f9c3861283767ae702c2a19ab Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 25 Dec 2015 11:12:16 +0800 Subject: [PATCH 122/151] iproute2: ip-route.8.in: Add expires option for ip route Signed-off-by: Hangbin Liu --- man/man8/ip-route.8.in | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 743d62be..c764bfc8 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -132,7 +132,9 @@ replace " } " .B congctl .IR NAME " ] [ " .B pref -.IR PREF " ]" +.IR PREF " ] [ " +.B expires +.IR TIME " ]" .ti -8 .IR TYPE " := [ " @@ -656,6 +658,12 @@ is a set of encapsulation attributes specific to the .in -8 .RE +.TP +.BI expires " TIME " "(4.4+ only)" +the route will be deleted after the expires time. +.B Only +support IPv6 at present. + .TP ip route delete delete route From b27f005b274fb0332dc7e88e2bc1344e11fba143 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 30 Dec 2015 17:17:45 -0800 Subject: [PATCH 123/151] genl: make string const Signed-off-by: Stephen Hemminger --- genl/genl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genl/genl.c b/genl/genl.c index 49b65960..e33fafdf 100644 --- a/genl/genl.c +++ b/genl/genl.c @@ -54,7 +54,7 @@ static int parse_nofopt(struct genl_util *f, int argc, char **argv) return 0; } -static struct genl_util *get_genl_kind(char *str) +static struct genl_util *get_genl_kind(const char *str) { void *dlh; char buf[256]; From e49b51d6631290bf2df0efd56aa511e3387216ea Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 30 Dec 2015 17:19:04 -0800 Subject: [PATCH 124/151] monitor: fix file handle leak In some cases passing file to monitor left file open. 
Signed-off-by: Stephen Hemminger --- ip/ipmonitor.c | 6 +++++- ip/xfrm_monitor.c | 6 +++++- tc/tc_monitor.c | 10 +++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c index 8bcf8822..99a237f4 100644 --- a/ip/ipmonitor.c +++ b/ip/ipmonitor.c @@ -284,12 +284,16 @@ int do_ipmonitor(int argc, char **argv) } if (file) { FILE *fp; + int err; + fp = fopen(file, "r"); if (fp == NULL) { perror("Cannot fopen"); exit(-1); } - return rtnl_from_file(fp, accept_msg, stdout); + err = rtnl_from_file(fp, accept_msg, stdout); + fclose(fp); + return err; } if (rtnl_open(&rth, groups) < 0) diff --git a/ip/xfrm_monitor.c b/ip/xfrm_monitor.c index 8b21efad..e6e991af 100644 --- a/ip/xfrm_monitor.c +++ b/ip/xfrm_monitor.c @@ -411,12 +411,16 @@ int do_xfrm_monitor(int argc, char **argv) if (file) { FILE *fp; + int err; + fp = fopen(file, "r"); if (fp == NULL) { perror("Cannot fopen"); exit(-1); } - return rtnl_from_file(fp, xfrm_accept_msg, (void*)stdout); + err = rtnl_from_file(fp, xfrm_accept_msg, stdout); + fclose(fp); + return err; } if (rtnl_open_byproto(&rth, groups, NETLINK_XFRM) < 0) diff --git a/tc/tc_monitor.c b/tc/tc_monitor.c index 097068e9..ebb94320 100644 --- a/tc/tc_monitor.c +++ b/tc/tc_monitor.c @@ -91,13 +91,17 @@ int do_tcmonitor(int argc, char **argv) } if (file) { - FILE *fp; - fp = fopen(file, "r"); + FILE *fp = fopen(file, "r"); + int ret; + if (fp == NULL) { perror("Cannot fopen"); exit(-1); } - return rtnl_from_file(fp, accept_tcmsg, (void*)stdout); + + ret = rtnl_from_file(fp, accept_tcmsg, stdout); + fclose(fp); + return ret; } if (rtnl_open(&rth, groups) < 0) From b90b773ca6aa1b1a39d76186d1a7639a13f5c916 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 30 Dec 2015 17:28:11 -0800 Subject: [PATCH 125/151] lnstat: fix error handling Error handling was silent and had leaks. 
Signed-off-by: Stephen Hemminger --- misc/lnstat_util.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c index 70a77c56..a2583665 100644 --- a/misc/lnstat_util.c +++ b/misc/lnstat_util.c @@ -172,8 +172,10 @@ static struct lnstat_file *alloc_and_open(const char *path, const char *file) /* allocate */ lf = malloc(sizeof(*lf)); - if (!lf) + if (!lf) { + fprintf(stderr, "out of memory\n"); return NULL; + } /* initialize */ memset(lf, 0, sizeof(*lf)); @@ -190,6 +192,7 @@ static struct lnstat_file *alloc_and_open(const char *path, const char *file) /* open */ lf->fp = fopen(lf->path, "r"); if (!lf->fp) { + perror(lf->path); free(lf); return NULL; } @@ -256,12 +259,16 @@ struct lnstat_file *lnstat_scan_dir(const char *path, const int num_req_files, continue; lf = alloc_and_open(path, de->d_name); - if (!lf) + if (!lf) { + closedir(dir); return NULL; + } /* fill in field structure */ - if (lnstat_scan_fields(lf) < 0) + if (lnstat_scan_fields(lf) < 0) { + closedir(dir); return NULL; + } /* prepend to global list */ lf->next = lnstat_files; From c13b6b097ab38b346271ce5ef802a0372dcbf78d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 30 Dec 2015 18:06:12 -0800 Subject: [PATCH 126/151] add coverity model file Track any coverity overrides for this project. Signed-off-by: Stephen Hemminger --- lib/coverity_model.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 lib/coverity_model.c diff --git a/lib/coverity_model.c b/lib/coverity_model.c new file mode 100644 index 00000000..c8963020 --- /dev/null +++ b/lib/coverity_model.c @@ -0,0 +1,19 @@ +/* + * Coverity Scan model + * + * This is a modeling file for Coverity Scan. Modeling helps to avoid false + * positives. + * + * - A model file can't import any header files. + * - Therefore only some built-in primitives like int, char and void are + * available but not wchar_t, NULL etc. 
+ * - Modeling doesn't need full structs and typedefs. Rudimentary structs + * and similar types are sufficient. + * - An uninitialized local pointer is not an error. It signifies that the + * variable could be either NULL or have some data. + * + * Coverity Scan doesn't pick up modifications automatically. The model file + * must be uploaded by an admin. + */ + + From 5cd1adba79d33644debd4ba498bb262c5bebcfba Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 3 Jan 2016 15:14:27 -0800 Subject: [PATCH 127/151] Update to current iptables headers Keep in sync with current iptables upstream --- include/ip6tables.h | 141 +------ include/iptables.h | 186 +-------- include/iptables/internal.h | 13 + include/libiptc/ipt_kernel_headers.h | 13 +- include/libiptc/libip6tc.h | 129 +++--- include/libiptc/libiptc.h | 128 +++--- include/libiptc/libxtc.h | 33 ++ include/libiptc/xtcshared.h | 20 + include/xtables.h | 567 +++++++++++++++++++++++++++ 9 files changed, 795 insertions(+), 435 deletions(-) create mode 100644 include/iptables/internal.h create mode 100644 include/libiptc/libxtc.h create mode 100644 include/libiptc/xtcshared.h create mode 100644 include/xtables.h diff --git a/include/ip6tables.h b/include/ip6tables.h index 1050593a..5f1c5b65 100644 --- a/include/ip6tables.h +++ b/include/ip6tables.h @@ -1,141 +1,20 @@ #ifndef _IP6TABLES_USER_H #define _IP6TABLES_USER_H -#include "iptables_common.h" -#include "libiptc/libip6tc.h" - -struct ip6tables_rule_match -{ - struct ip6tables_rule_match *next; - - struct ip6tables_match *match; -}; - -/* Include file for additions: new matches and targets. */ -struct ip6tables_match -{ - struct ip6tables_match *next; - - ip6t_chainlabel name; - - const char *version; - - /* Size of match data. */ - size_t size; - - /* Size of match data relevent for userspace comparison purposes */ - size_t userspacesize; - - /* Function which prints out usage message. */ - void (*help)(void); - - /* Initialize the match. 
*/ - void (*init)(struct ip6t_entry_match *m, unsigned int *nfcache); - - /* Function which parses command options; returns true if it - ate an option */ - int (*parse)(int c, char **argv, int invert, unsigned int *flags, - const struct ip6t_entry *entry, - unsigned int *nfcache, - struct ip6t_entry_match **match); - - /* Final check; exit if not ok. */ - void (*final_check)(unsigned int flags); - - /* Prints out the match iff non-NULL: put space at end */ - void (*print)(const struct ip6t_ip6 *ip, - const struct ip6t_entry_match *match, int numeric); - - /* Saves the union ipt_matchinfo in parsable form to stdout. */ - void (*save)(const struct ip6t_ip6 *ip, - const struct ip6t_entry_match *match); - - /* Pointer to list of extra command-line options */ - const struct option *extra_opts; - - /* Ignore these men behind the curtain: */ - unsigned int option_offset; - struct ip6t_entry_match *m; - unsigned int mflags; -#ifdef NO_SHARED_LIBS - unsigned int loaded; /* simulate loading so options are merged properly */ -#endif -}; - -struct ip6tables_target -{ - struct ip6tables_target *next; - - ip6t_chainlabel name; - - const char *version; - - /* Size of target data. */ - size_t size; - - /* Size of target data relevent for userspace comparison purposes */ - size_t userspacesize; - - /* Function which prints out usage message. */ - void (*help)(void); - - /* Initialize the target. */ - void (*init)(struct ip6t_entry_target *t, unsigned int *nfcache); - - /* Function which parses command options; returns true if it - ate an option */ - int (*parse)(int c, char **argv, int invert, unsigned int *flags, - const struct ip6t_entry *entry, - struct ip6t_entry_target **target); - - /* Final check; exit if not ok. */ - void (*final_check)(unsigned int flags); - - /* Prints out the target iff non-NULL: put space at end */ - void (*print)(const struct ip6t_ip6 *ip, - const struct ip6t_entry_target *target, int numeric); - - /* Saves the targinfo in parsable form to stdout. 
*/ - void (*save)(const struct ip6t_ip6 *ip, - const struct ip6t_entry_target *target); - - /* Pointer to list of extra command-line options */ - struct option *extra_opts; - - /* Ignore these men behind the curtain: */ - unsigned int option_offset; - struct ip6t_entry_target *t; - unsigned int tflags; - unsigned int used; -#ifdef NO_SHARED_LIBS - unsigned int loaded; /* simulate loading so options are merged properly */ -#endif -}; - -extern int line; +#include +#include +#include +#include /* Your shared library should call one of these. */ -extern void register_match6(struct ip6tables_match *me); -extern void register_target6(struct ip6tables_target *me); - extern int do_command6(int argc, char *argv[], char **table, - ip6tc_handle_t *handle); -/* Keeping track of external matches and targets: linked lists. */ -extern struct ip6tables_match *ip6tables_matches; -extern struct ip6tables_target *ip6tables_targets; + struct xtc_handle **handle, bool restore); -enum ip6t_tryload { - DONT_LOAD, - TRY_LOAD, - LOAD_MUST_SUCCEED -}; +extern int for_each_chain6(int (*fn)(const xt_chainlabel, int, struct xtc_handle *), int verbose, int builtinstoo, struct xtc_handle *handle); +extern int flush_entries6(const xt_chainlabel chain, int verbose, struct xtc_handle *handle); +extern int delete_chain6(const xt_chainlabel chain, int verbose, struct xtc_handle *handle); +void print_rule6(const struct ip6t_entry *e, struct xtc_handle *h, const char *chain, int counters); -extern struct ip6tables_target *find_target(const char *name, enum ip6t_tryload); -extern struct ip6tables_match *find_match(const char *name, enum ip6t_tryload, struct ip6tables_rule_match **match); - -extern int for_each_chain(int (*fn)(const ip6t_chainlabel, int, ip6tc_handle_t *), int verbose, int builtinstoo, ip6tc_handle_t *handle); -extern int flush_entries(const ip6t_chainlabel chain, int verbose, ip6tc_handle_t *handle); -extern int delete_chain(const ip6t_chainlabel chain, int verbose, ip6tc_handle_t 
*handle); -extern int ip6tables_insmod(const char *modname, const char *modprobe); +extern struct xtables_globals ip6tables_globals; #endif /*_IP6TABLES_USER_H*/ diff --git a/include/iptables.h b/include/iptables.h index f1e62e23..78c10abd 100644 --- a/include/iptables.h +++ b/include/iptables.h @@ -1,179 +1,25 @@ #ifndef _IPTABLES_USER_H #define _IPTABLES_USER_H -#include "iptables_common.h" -#include "libiptc/libiptc.h" - -#ifndef IPT_LIB_DIR -#define IPT_LIB_DIR "/usr/local/lib/iptables" -#endif - -#ifndef IPPROTO_SCTP -#define IPPROTO_SCTP 132 -#endif - -#ifndef IPT_SO_GET_REVISION_MATCH /* Old kernel source. */ -#define IPT_SO_GET_REVISION_MATCH (IPT_BASE_CTL + 2) -#define IPT_SO_GET_REVISION_TARGET (IPT_BASE_CTL + 3) - -struct ipt_get_revision -{ - char name[IPT_FUNCTION_MAXNAMELEN-1]; - - u_int8_t revision; -}; -#endif /* IPT_SO_GET_REVISION_MATCH Old kernel source */ - -struct iptables_rule_match -{ - struct iptables_rule_match *next; - - struct iptables_match *match; -}; - -/* Include file for additions: new matches and targets. */ -struct iptables_match -{ - struct iptables_match *next; - - ipt_chainlabel name; - - /* Revision of match (0 by default). */ - u_int8_t revision; - - const char *version; - - /* Size of match data. */ - size_t size; - - /* Size of match data relevent for userspace comparison purposes */ - size_t userspacesize; - - /* Function which prints out usage message. */ - void (*help)(void); - - /* Initialize the match. */ - void (*init)(struct ipt_entry_match *m, unsigned int *nfcache); - - /* Function which parses command options; returns true if it - ate an option */ - int (*parse)(int c, char **argv, int invert, unsigned int *flags, - const struct ipt_entry *entry, - unsigned int *nfcache, - struct ipt_entry_match **match); - - /* Final check; exit if not ok. 
*/ - void (*final_check)(unsigned int flags); - - /* Prints out the match iff non-NULL: put space at end */ - void (*print)(const struct ipt_ip *ip, - const struct ipt_entry_match *match, int numeric); - - /* Saves the match info in parsable form to stdout. */ - void (*save)(const struct ipt_ip *ip, - const struct ipt_entry_match *match); - - /* Pointer to list of extra command-line options */ - const struct option *extra_opts; - - /* Ignore these men behind the curtain: */ - unsigned int option_offset; - struct ipt_entry_match *m; - unsigned int mflags; -#ifdef NO_SHARED_LIBS - unsigned int loaded; /* simulate loading so options are merged properly */ -#endif -}; - -struct iptables_target -{ - struct iptables_target *next; - - ipt_chainlabel name; - - /* Revision of target (0 by default). */ - u_int8_t revision; - - const char *version; - - /* Size of target data. */ - size_t size; - - /* Size of target data relevent for userspace comparison purposes */ - size_t userspacesize; - - /* Function which prints out usage message. */ - void (*help)(void); - - /* Initialize the target. */ - void (*init)(struct ipt_entry_target *t, unsigned int *nfcache); - - /* Function which parses command options; returns true if it - ate an option */ - int (*parse)(int c, char **argv, int invert, unsigned int *flags, - const struct ipt_entry *entry, - struct ipt_entry_target **target); - - /* Final check; exit if not ok. */ - void (*final_check)(unsigned int flags); - - /* Prints out the target iff non-NULL: put space at end */ - void (*print)(const struct ipt_ip *ip, - const struct ipt_entry_target *target, int numeric); - - /* Saves the targinfo in parsable form to stdout. 
*/ - void (*save)(const struct ipt_ip *ip, - const struct ipt_entry_target *target); - - /* Pointer to list of extra command-line options */ - struct option *extra_opts; - - /* Ignore these men behind the curtain: */ - unsigned int option_offset; - struct ipt_entry_target *t; - unsigned int tflags; - unsigned int used; -#ifdef NO_SHARED_LIBS - unsigned int loaded; /* simulate loading so options are merged properly */ -#endif -}; - -extern int line; +#include +#include +#include +#include /* Your shared library should call one of these. */ -extern void register_match(struct iptables_match *me); -extern void register_target(struct iptables_target *me); -extern void xtables_register_target(struct iptables_target *me); -extern int build_st(struct iptables_target *target, struct ipt_entry_target *t); +extern int do_command4(int argc, char *argv[], char **table, + struct xtc_handle **handle, bool restore); +extern int delete_chain4(const xt_chainlabel chain, int verbose, + struct xtc_handle *handle); +extern int flush_entries4(const xt_chainlabel chain, int verbose, + struct xtc_handle *handle); +extern int for_each_chain4(int (*fn)(const xt_chainlabel, int, struct xtc_handle *), + int verbose, int builtinstoo, struct xtc_handle *handle); +extern void print_rule4(const struct ipt_entry *e, + struct xtc_handle *handle, const char *chain, int counters); -extern struct in_addr *dotted_to_addr(const char *dotted); -extern char *addr_to_dotted(const struct in_addr *addrp); -extern char *addr_to_anyname(const struct in_addr *addr); -extern char *mask_to_dotted(const struct in_addr *mask); +extern struct xtables_globals iptables_globals; -extern void parse_hostnetworkmask(const char *name, struct in_addr **addrpp, - struct in_addr *maskp, unsigned int *naddrs); -extern u_int16_t parse_protocol(const char *s); +extern struct xtables_globals xtables_globals; -extern int do_command(int argc, char *argv[], char **table, - iptc_handle_t *handle); -/* Keeping track of external 
matches and targets: linked lists. */ -extern struct iptables_match *iptables_matches; -extern struct iptables_target *iptables_targets; - -enum ipt_tryload { - DONT_LOAD, - TRY_LOAD, - LOAD_MUST_SUCCEED -}; - -extern struct iptables_target *find_target(const char *name, enum ipt_tryload); -extern struct iptables_match *find_match(const char *name, enum ipt_tryload, struct iptables_rule_match **match); - -extern int delete_chain(const ipt_chainlabel chain, int verbose, - iptc_handle_t *handle); -extern int flush_entries(const ipt_chainlabel chain, int verbose, - iptc_handle_t *handle); -extern int for_each_chain(int (*fn)(const ipt_chainlabel, int, iptc_handle_t *), - int verbose, int builtinstoo, iptc_handle_t *handle); #endif /*_IPTABLES_USER_H*/ diff --git a/include/iptables/internal.h b/include/iptables/internal.h new file mode 100644 index 00000000..62a8ecb9 --- /dev/null +++ b/include/iptables/internal.h @@ -0,0 +1,13 @@ +#ifndef IPTABLES_INTERNAL_H +#define IPTABLES_INTERNAL_H 1 + +#define IPTABLES_VERSION "1.6.0" + +/** + * Program's own name and version. + */ +extern const char *program_name, *program_version; + +extern int line; + +#endif /* IPTABLES_INTERNAL_H */ diff --git a/include/libiptc/ipt_kernel_headers.h b/include/libiptc/ipt_kernel_headers.h index 7e878284..a5963e94 100644 --- a/include/libiptc/ipt_kernel_headers.h +++ b/include/libiptc/ipt_kernel_headers.h @@ -5,22 +5,11 @@ #include -#if defined(__GLIBC__) && __GLIBC__ == 2 #include #include #include #include #include +#include #include -#else /* libc5 */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif #endif diff --git a/include/libiptc/libip6tc.h b/include/libiptc/libip6tc.h index 7a247c46..9aed80a0 100644 --- a/include/libiptc/libip6tc.h +++ b/include/libiptc/libip6tc.h @@ -2,153 +2,160 @@ #define _LIBIP6TC_H /* Library which manipulates firewall rules. Version 0.2. 
*/ +#include #include -#include - -#ifndef IP6T_MIN_ALIGN -#define IP6T_MIN_ALIGN (__alignof__(struct ip6t_entry)) +#ifdef __cplusplus +# include +#else +# include /* INT_MAX in ip6_tables.h */ #endif -#define IP6T_ALIGN(s) (((s) + (IP6T_MIN_ALIGN-1)) & ~(IP6T_MIN_ALIGN-1)) +#include +#include -typedef char ip6t_chainlabel[32]; +#define ip6tc_handle xtc_handle +#define ip6t_chainlabel xt_chainlabel #define IP6TC_LABEL_ACCEPT "ACCEPT" #define IP6TC_LABEL_DROP "DROP" #define IP6TC_LABEL_QUEUE "QUEUE" #define IP6TC_LABEL_RETURN "RETURN" -/* Transparent handle type. */ -typedef struct ip6tc_handle *ip6tc_handle_t; - /* Does this chain exist? */ -int ip6tc_is_chain(const char *chain, const ip6tc_handle_t handle); +int ip6tc_is_chain(const char *chain, struct xtc_handle *const handle); /* Take a snapshot of the rules. Returns NULL on error. */ -ip6tc_handle_t ip6tc_init(const char *tablename); +struct xtc_handle *ip6tc_init(const char *tablename); /* Cleanup after ip6tc_init(). */ -void ip6tc_free(ip6tc_handle_t *h); +void ip6tc_free(struct xtc_handle *h); /* Iterator functions to run through the chains. Returns NULL at end. */ -const char *ip6tc_first_chain(ip6tc_handle_t *handle); -const char *ip6tc_next_chain(ip6tc_handle_t *handle); +const char *ip6tc_first_chain(struct xtc_handle *handle); +const char *ip6tc_next_chain(struct xtc_handle *handle); /* Get first rule in the given chain: NULL for empty chain. */ const struct ip6t_entry *ip6tc_first_rule(const char *chain, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Returns NULL when rules run out. */ const struct ip6t_entry *ip6tc_next_rule(const struct ip6t_entry *prev, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Returns a pointer to the target name of this position. */ const char *ip6tc_get_target(const struct ip6t_entry *e, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Is this a built-in chain? 
*/ -int ip6tc_builtin(const char *chain, const ip6tc_handle_t handle); +int ip6tc_builtin(const char *chain, struct xtc_handle *const handle); /* Get the policy of a given built-in chain */ const char *ip6tc_get_policy(const char *chain, - struct ip6t_counters *counters, - ip6tc_handle_t *handle); + struct xt_counters *counters, + struct xtc_handle *handle); /* These functions return TRUE for OK or 0 and set errno. If errno == 0, it means there was a version error (ie. upgrade libiptc). */ /* Rule numbers start at 1 for the first rule. */ /* Insert the entry `fw' in chain `chain' into position `rulenum'. */ -int ip6tc_insert_entry(const ip6t_chainlabel chain, +int ip6tc_insert_entry(const xt_chainlabel chain, const struct ip6t_entry *e, unsigned int rulenum, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Atomically replace rule `rulenum' in `chain' with `fw'. */ -int ip6tc_replace_entry(const ip6t_chainlabel chain, +int ip6tc_replace_entry(const xt_chainlabel chain, const struct ip6t_entry *e, unsigned int rulenum, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Append entry `fw' to chain `chain'. Equivalent to insert with rulenum = length of chain. */ -int ip6tc_append_entry(const ip6t_chainlabel chain, +int ip6tc_append_entry(const xt_chainlabel chain, const struct ip6t_entry *e, - ip6tc_handle_t *handle); + struct xtc_handle *handle); -/* Delete the first rule in `chain' which matches `fw'. */ -int ip6tc_delete_entry(const ip6t_chainlabel chain, +/* Check whether a matching rule exists */ +int ip6tc_check_entry(const xt_chainlabel chain, const struct ip6t_entry *origfw, unsigned char *matchmask, - ip6tc_handle_t *handle); + struct xtc_handle *handle); + +/* Delete the first rule in `chain' which matches `fw'. */ +int ip6tc_delete_entry(const xt_chainlabel chain, + const struct ip6t_entry *origfw, + unsigned char *matchmask, + struct xtc_handle *handle); /* Delete the rule in position `rulenum' in `chain'. 
*/ -int ip6tc_delete_num_entry(const ip6t_chainlabel chain, +int ip6tc_delete_num_entry(const xt_chainlabel chain, unsigned int rulenum, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Check the packet `fw' on chain `chain'. Returns the verdict, or NULL and sets errno. */ -const char *ip6tc_check_packet(const ip6t_chainlabel chain, +const char *ip6tc_check_packet(const xt_chainlabel chain, struct ip6t_entry *, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* Flushes the entries in the given chain (ie. empties chain). */ -int ip6tc_flush_entries(const ip6t_chainlabel chain, - ip6tc_handle_t *handle); +int ip6tc_flush_entries(const xt_chainlabel chain, + struct xtc_handle *handle); /* Zeroes the counters in a chain. */ -int ip6tc_zero_entries(const ip6t_chainlabel chain, - ip6tc_handle_t *handle); +int ip6tc_zero_entries(const xt_chainlabel chain, + struct xtc_handle *handle); /* Creates a new chain. */ -int ip6tc_create_chain(const ip6t_chainlabel chain, - ip6tc_handle_t *handle); +int ip6tc_create_chain(const xt_chainlabel chain, + struct xtc_handle *handle); /* Deletes a chain. */ -int ip6tc_delete_chain(const ip6t_chainlabel chain, - ip6tc_handle_t *handle); +int ip6tc_delete_chain(const xt_chainlabel chain, + struct xtc_handle *handle); /* Renames a chain. */ -int ip6tc_rename_chain(const ip6t_chainlabel oldname, - const ip6t_chainlabel newname, - ip6tc_handle_t *handle); +int ip6tc_rename_chain(const xt_chainlabel oldname, + const xt_chainlabel newname, + struct xtc_handle *handle); /* Sets the policy on a built-in chain. 
*/ -int ip6tc_set_policy(const ip6t_chainlabel chain, - const ip6t_chainlabel policy, - struct ip6t_counters *counters, - ip6tc_handle_t *handle); +int ip6tc_set_policy(const xt_chainlabel chain, + const xt_chainlabel policy, + struct xt_counters *counters, + struct xtc_handle *handle); /* Get the number of references to this chain */ -int ip6tc_get_references(unsigned int *ref, const ip6t_chainlabel chain, - ip6tc_handle_t *handle); +int ip6tc_get_references(unsigned int *ref, const xt_chainlabel chain, + struct xtc_handle *handle); /* read packet and byte counters for a specific rule */ -struct ip6t_counters *ip6tc_read_counter(const ip6t_chainlabel chain, +struct xt_counters *ip6tc_read_counter(const xt_chainlabel chain, unsigned int rulenum, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* zero packet and byte counters for a specific rule */ -int ip6tc_zero_counter(const ip6t_chainlabel chain, +int ip6tc_zero_counter(const xt_chainlabel chain, unsigned int rulenum, - ip6tc_handle_t *handle); + struct xtc_handle *handle); /* set packet and byte counters for a specific rule */ -int ip6tc_set_counter(const ip6t_chainlabel chain, +int ip6tc_set_counter(const xt_chainlabel chain, unsigned int rulenum, - struct ip6t_counters *counters, - ip6tc_handle_t *handle); + struct xt_counters *counters, + struct xtc_handle *handle); /* Makes the actual changes. */ -int ip6tc_commit(ip6tc_handle_t *handle); +int ip6tc_commit(struct xtc_handle *handle); /* Get raw socket. */ -int ip6tc_get_raw_socket(); +int ip6tc_get_raw_socket(void); /* Translates errno numbers into more human-readable form than strerror. 
*/ const char *ip6tc_strerror(int err); -/* Return prefix length, or -1 if not contiguous */ -int ipv6_prefix_length(const struct in6_addr *a); +extern void dump_entries6(struct xtc_handle *const); + +extern const struct xtc_ops ip6tc_ops; #endif /* _LIBIP6TC_H */ diff --git a/include/libiptc/libiptc.h b/include/libiptc/libiptc.h index 7628bda6..24cdbdb7 100644 --- a/include/libiptc/libiptc.h +++ b/include/libiptc/libiptc.h @@ -2,155 +2,157 @@ #define _LIBIPTC_H /* Library which manipulates filtering rules. */ +#include #include +#ifdef __cplusplus +# include +#else +# include /* INT_MAX in ip_tables.h */ +#endif #include +#include #ifdef __cplusplus extern "C" { #endif -#ifndef IPT_MIN_ALIGN -/* ipt_entry has pointers and u_int64_t's in it, so if you align to - it, you'll also align to any crazy matches and targets someone - might write */ -#define IPT_MIN_ALIGN (__alignof__(struct ipt_entry)) -#endif - -#define IPT_ALIGN(s) (((s) + ((IPT_MIN_ALIGN)-1)) & ~((IPT_MIN_ALIGN)-1)) - -typedef char ipt_chainlabel[32]; +#define iptc_handle xtc_handle +#define ipt_chainlabel xt_chainlabel #define IPTC_LABEL_ACCEPT "ACCEPT" #define IPTC_LABEL_DROP "DROP" #define IPTC_LABEL_QUEUE "QUEUE" #define IPTC_LABEL_RETURN "RETURN" -/* Transparent handle type. */ -typedef struct iptc_handle *iptc_handle_t; - /* Does this chain exist? */ -int iptc_is_chain(const char *chain, const iptc_handle_t handle); +int iptc_is_chain(const char *chain, struct xtc_handle *const handle); /* Take a snapshot of the rules. Returns NULL on error. */ -iptc_handle_t iptc_init(const char *tablename); +struct xtc_handle *iptc_init(const char *tablename); /* Cleanup after iptc_init(). */ -void iptc_free(iptc_handle_t *h); +void iptc_free(struct xtc_handle *h); /* Iterator functions to run through the chains. Returns NULL at end. 
*/ -const char *iptc_first_chain(iptc_handle_t *handle); -const char *iptc_next_chain(iptc_handle_t *handle); +const char *iptc_first_chain(struct xtc_handle *handle); +const char *iptc_next_chain(struct xtc_handle *handle); /* Get first rule in the given chain: NULL for empty chain. */ const struct ipt_entry *iptc_first_rule(const char *chain, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Returns NULL when rules run out. */ const struct ipt_entry *iptc_next_rule(const struct ipt_entry *prev, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Returns a pointer to the target name of this entry. */ const char *iptc_get_target(const struct ipt_entry *e, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Is this a built-in chain? */ -int iptc_builtin(const char *chain, const iptc_handle_t handle); +int iptc_builtin(const char *chain, struct xtc_handle *const handle); /* Get the policy of a given built-in chain */ const char *iptc_get_policy(const char *chain, - struct ipt_counters *counter, - iptc_handle_t *handle); + struct xt_counters *counter, + struct xtc_handle *handle); /* These functions return TRUE for OK or 0 and set errno. If errno == 0, it means there was a version error (ie. upgrade libiptc). */ /* Rule numbers start at 1 for the first rule. */ /* Insert the entry `e' in chain `chain' into position `rulenum'. */ -int iptc_insert_entry(const ipt_chainlabel chain, +int iptc_insert_entry(const xt_chainlabel chain, const struct ipt_entry *e, unsigned int rulenum, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Atomically replace rule `rulenum' in `chain' with `e'. */ -int iptc_replace_entry(const ipt_chainlabel chain, +int iptc_replace_entry(const xt_chainlabel chain, const struct ipt_entry *e, unsigned int rulenum, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Append entry `e' to chain `chain'. Equivalent to insert with rulenum = length of chain. 
*/ -int iptc_append_entry(const ipt_chainlabel chain, +int iptc_append_entry(const xt_chainlabel chain, const struct ipt_entry *e, - iptc_handle_t *handle); + struct xtc_handle *handle); + +/* Check whether a matching rule exists */ +int iptc_check_entry(const xt_chainlabel chain, + const struct ipt_entry *origfw, + unsigned char *matchmask, + struct xtc_handle *handle); /* Delete the first rule in `chain' which matches `e', subject to matchmask (array of length == origfw) */ -int iptc_delete_entry(const ipt_chainlabel chain, +int iptc_delete_entry(const xt_chainlabel chain, const struct ipt_entry *origfw, unsigned char *matchmask, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Delete the rule in position `rulenum' in `chain'. */ -int iptc_delete_num_entry(const ipt_chainlabel chain, +int iptc_delete_num_entry(const xt_chainlabel chain, unsigned int rulenum, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Check the packet `e' on chain `chain'. Returns the verdict, or NULL and sets errno. */ -const char *iptc_check_packet(const ipt_chainlabel chain, +const char *iptc_check_packet(const xt_chainlabel chain, struct ipt_entry *entry, - iptc_handle_t *handle); + struct xtc_handle *handle); /* Flushes the entries in the given chain (ie. empties chain). */ -int iptc_flush_entries(const ipt_chainlabel chain, - iptc_handle_t *handle); +int iptc_flush_entries(const xt_chainlabel chain, + struct xtc_handle *handle); /* Zeroes the counters in a chain. */ -int iptc_zero_entries(const ipt_chainlabel chain, - iptc_handle_t *handle); +int iptc_zero_entries(const xt_chainlabel chain, + struct xtc_handle *handle); /* Creates a new chain. */ -int iptc_create_chain(const ipt_chainlabel chain, - iptc_handle_t *handle); +int iptc_create_chain(const xt_chainlabel chain, + struct xtc_handle *handle); /* Deletes a chain. 
*/ -int iptc_delete_chain(const ipt_chainlabel chain, - iptc_handle_t *handle); +int iptc_delete_chain(const xt_chainlabel chain, + struct xtc_handle *handle); /* Renames a chain. */ -int iptc_rename_chain(const ipt_chainlabel oldname, - const ipt_chainlabel newname, - iptc_handle_t *handle); +int iptc_rename_chain(const xt_chainlabel oldname, + const xt_chainlabel newname, + struct xtc_handle *handle); /* Sets the policy on a built-in chain. */ -int iptc_set_policy(const ipt_chainlabel chain, - const ipt_chainlabel policy, - struct ipt_counters *counters, - iptc_handle_t *handle); +int iptc_set_policy(const xt_chainlabel chain, + const xt_chainlabel policy, + struct xt_counters *counters, + struct xtc_handle *handle); /* Get the number of references to this chain */ int iptc_get_references(unsigned int *ref, - const ipt_chainlabel chain, - iptc_handle_t *handle); + const xt_chainlabel chain, + struct xtc_handle *handle); /* read packet and byte counters for a specific rule */ -struct ipt_counters *iptc_read_counter(const ipt_chainlabel chain, +struct xt_counters *iptc_read_counter(const xt_chainlabel chain, unsigned int rulenum, - iptc_handle_t *handle); + struct xtc_handle *handle); /* zero packet and byte counters for a specific rule */ -int iptc_zero_counter(const ipt_chainlabel chain, +int iptc_zero_counter(const xt_chainlabel chain, unsigned int rulenum, - iptc_handle_t *handle); + struct xtc_handle *handle); /* set packet and byte counters for a specific rule */ -int iptc_set_counter(const ipt_chainlabel chain, +int iptc_set_counter(const xt_chainlabel chain, unsigned int rulenum, - struct ipt_counters *counters, - iptc_handle_t *handle); + struct xt_counters *counters, + struct xtc_handle *handle); /* Makes the actual changes. */ -int iptc_commit(iptc_handle_t *handle); +int iptc_commit(struct xtc_handle *handle); /* Get raw socket. 
*/ int iptc_get_raw_socket(void); @@ -158,6 +160,10 @@ int iptc_get_raw_socket(void); /* Translates errno numbers into more human-readable form than strerror. */ const char *iptc_strerror(int err); +extern void dump_entries(struct xtc_handle *const); + +extern const struct xtc_ops iptc_ops; + #ifdef __cplusplus } #endif diff --git a/include/libiptc/libxtc.h b/include/libiptc/libxtc.h new file mode 100644 index 00000000..37010188 --- /dev/null +++ b/include/libiptc/libxtc.h @@ -0,0 +1,33 @@ +#ifndef _LIBXTC_H +#define _LIBXTC_H +/* Library which manipulates filtering rules. */ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef XT_MIN_ALIGN +/* xt_entry has pointers and u_int64_t's in it, so if you align to + it, you'll also align to any crazy matches and targets someone + might write */ +#define XT_MIN_ALIGN (__alignof__(struct xt_entry)) +#endif + +#ifndef XT_ALIGN +#define XT_ALIGN(s) (((s) + ((XT_MIN_ALIGN)-1)) & ~((XT_MIN_ALIGN)-1)) +#endif + +#define XTC_LABEL_ACCEPT "ACCEPT" +#define XTC_LABEL_DROP "DROP" +#define XTC_LABEL_QUEUE "QUEUE" +#define XTC_LABEL_RETURN "RETURN" + + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBXTC_H */ diff --git a/include/libiptc/xtcshared.h b/include/libiptc/xtcshared.h new file mode 100644 index 00000000..773ebc4c --- /dev/null +++ b/include/libiptc/xtcshared.h @@ -0,0 +1,20 @@ +#ifndef _LIBXTC_SHARED_H +#define _LIBXTC_SHARED_H 1 + +typedef char xt_chainlabel[32]; +struct xtc_handle; +struct xt_counters; + +struct xtc_ops { + int (*commit)(struct xtc_handle *); + void (*free)(struct xtc_handle *); + int (*builtin)(const char *, struct xtc_handle *const); + int (*is_chain)(const char *, struct xtc_handle *const); + int (*flush_entries)(const xt_chainlabel, struct xtc_handle *); + int (*create_chain)(const xt_chainlabel, struct xtc_handle *); + int (*set_policy)(const xt_chainlabel, const xt_chainlabel, + struct xt_counters *, struct xtc_handle *); + const char *(*strerror)(int); +}; + +#endif /* 
_LIBXTC_SHARED_H */ diff --git a/include/xtables.h b/include/xtables.h new file mode 100644 index 00000000..978ae0d1 --- /dev/null +++ b/include/xtables.h @@ -0,0 +1,567 @@ +#ifndef _XTABLES_H +#define _XTABLES_H + +/* + * Changing any structs/functions may incur a needed change + * in libxtables_vcurrent/vage too. + */ + +#include /* PF_* */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif +#ifndef IPPROTO_DCCP +#define IPPROTO_DCCP 33 +#endif +#ifndef IPPROTO_MH +# define IPPROTO_MH 135 +#endif +#ifndef IPPROTO_UDPLITE +#define IPPROTO_UDPLITE 136 +#endif + +#include + +struct in_addr; + +/* + * .size is here so that there is a somewhat reasonable check + * against the chosen .type. + */ +#define XTOPT_POINTER(stype, member) \ + .ptroff = offsetof(stype, member), \ + .size = sizeof(((stype *)NULL)->member) +#define XTOPT_TABLEEND {.name = NULL} + +/** + * Select the format the input has to conform to, as well as the target type + * (area pointed to with XTOPT_POINTER). Note that the storing is not always + * uniform. @cb->val will be populated with as much as there is space, i.e. + * exactly 2 items for ranges, but the target area can receive more values + * (e.g. in case of ranges), or less values (e.g. %XTTYPE_HOSTMASK). 
+ * + * %XTTYPE_NONE: option takes no argument + * %XTTYPE_UINT*: standard integer + * %XTTYPE_UINT*RC: colon-separated range of standard integers + * %XTTYPE_DOUBLE: double-precision floating point number + * %XTTYPE_STRING: arbitrary string + * %XTTYPE_TOSMASK: 8-bit TOS value with optional mask + * %XTTYPE_MARKMASK32: 32-bit mark with optional mask + * %XTTYPE_SYSLOGLEVEL: syslog level by name or number + * %XTTYPE_HOST: one host or address (ptr: union nf_inet_addr) + * %XTTYPE_HOSTMASK: one host or address, with an optional prefix length + * (ptr: union nf_inet_addr; only host portion is stored) + * %XTTYPE_PROTOCOL: protocol number/name from /etc/protocols (ptr: uint8_t) + * %XTTYPE_PORT: 16-bit port name or number (supports %XTOPT_NBO) + * %XTTYPE_PORTRC: colon-separated port range (names acceptable), + * (supports %XTOPT_NBO) + * %XTTYPE_PLEN: prefix length + * %XTTYPE_PLENMASK: prefix length (ptr: union nf_inet_addr) + * %XTTYPE_ETHERMAC: Ethernet MAC address in hex form + */ +enum xt_option_type { + XTTYPE_NONE, + XTTYPE_UINT8, + XTTYPE_UINT16, + XTTYPE_UINT32, + XTTYPE_UINT64, + XTTYPE_UINT8RC, + XTTYPE_UINT16RC, + XTTYPE_UINT32RC, + XTTYPE_UINT64RC, + XTTYPE_DOUBLE, + XTTYPE_STRING, + XTTYPE_TOSMASK, + XTTYPE_MARKMASK32, + XTTYPE_SYSLOGLEVEL, + XTTYPE_HOST, + XTTYPE_HOSTMASK, + XTTYPE_PROTOCOL, + XTTYPE_PORT, + XTTYPE_PORTRC, + XTTYPE_PLEN, + XTTYPE_PLENMASK, + XTTYPE_ETHERMAC, +}; + +/** + * %XTOPT_INVERT: option is invertible (usable with !) 
+ * %XTOPT_MAND: option is mandatory + * %XTOPT_MULTI: option may be specified multiple times + * %XTOPT_PUT: store value into memory at @ptroff + * %XTOPT_NBO: store value in network-byte order + * (only certain XTTYPEs recognize this) + */ +enum xt_option_flags { + XTOPT_INVERT = 1 << 0, + XTOPT_MAND = 1 << 1, + XTOPT_MULTI = 1 << 2, + XTOPT_PUT = 1 << 3, + XTOPT_NBO = 1 << 4, +}; + +/** + * @name: name of option + * @type: type of input and validation method, see %XTTYPE_* + * @id: unique number (within extension) for option, 0-31 + * @excl: bitmask of flags that cannot be used with this option + * @also: bitmask of flags that must be used with this option + * @flags: bitmask of option flags, see %XTOPT_* + * @ptroff: offset into private structure for member + * @size: size of the item pointed to by @ptroff; this is a safeguard + * @min: lowest allowed value (for singular integral types) + * @max: highest allowed value (for singular integral types) + */ +struct xt_option_entry { + const char *name; + enum xt_option_type type; + unsigned int id, excl, also, flags; + unsigned int ptroff; + size_t size; + unsigned int min, max; +}; + +/** + * @arg: input from command line + * @ext_name: name of extension currently being processed + * @entry: current option being processed + * @data: per-extension kernel data block + * @xflags: options of the extension that have been used + * @invert: whether option was used with ! + * @nvals: number of results in uXX_multi + * @val: parsed result + * @udata: per-extension private scratch area + * (cf. 
xtables_{match,target}->udata_size) + */ +struct xt_option_call { + const char *arg, *ext_name; + const struct xt_option_entry *entry; + void *data; + unsigned int xflags; + bool invert; + uint8_t nvals; + union { + uint8_t u8, u8_range[2], syslog_level, protocol; + uint16_t u16, u16_range[2], port, port_range[2]; + uint32_t u32, u32_range[2]; + uint64_t u64, u64_range[2]; + double dbl; + struct { + union nf_inet_addr haddr, hmask; + uint8_t hlen; + }; + struct { + uint8_t tos_value, tos_mask; + }; + struct { + uint32_t mark, mask; + }; + uint8_t ethermac[6]; + } val; + /* Wished for a world where the ones below were gone: */ + union { + struct xt_entry_match **match; + struct xt_entry_target **target; + }; + void *xt_entry; + void *udata; +}; + +/** + * @ext_name: name of extension currently being processed + * @data: per-extension (kernel) data block + * @udata: per-extension private scratch area + * (cf. xtables_{match,target}->udata_size) + * @xflags: options of the extension that have been used + */ +struct xt_fcheck_call { + const char *ext_name; + void *data, *udata; + unsigned int xflags; +}; + +/** + * A "linear"/linked-list based name<->id map, for files similar to + * /etc/iproute2/. + */ +struct xtables_lmap { + char *name; + int id; + struct xtables_lmap *next; +}; + +enum xtables_ext_flags { + XTABLES_EXT_ALIAS = 1 << 0, +}; + +/* Include file for additions: new matches and targets. */ +struct xtables_match +{ + /* + * ABI/API version this module requires. Must be first member, + * as the rest of this struct may be subject to ABI changes. + */ + const char *version; + + struct xtables_match *next; + + const char *name; + const char *real_name; + + /* Revision of match (0 by default). */ + uint8_t revision; + + /* Extension flags */ + uint8_t ext_flags; + + uint16_t family; + + /* Size of match data. 
*/ + size_t size; + + /* Size of match data relevant for userspace comparison purposes */ + size_t userspacesize; + + /* Function which prints out usage message. */ + void (*help)(void); + + /* Initialize the match. */ + void (*init)(struct xt_entry_match *m); + + /* Function which parses command options; returns true if it + ate an option */ + /* entry is struct ipt_entry for example */ + int (*parse)(int c, char **argv, int invert, unsigned int *flags, + const void *entry, + struct xt_entry_match **match); + + /* Final check; exit if not ok. */ + void (*final_check)(unsigned int flags); + + /* Prints out the match iff non-NULL: put space at end */ + /* ip is struct ipt_ip * for example */ + void (*print)(const void *ip, + const struct xt_entry_match *match, int numeric); + + /* Saves the match info in parsable form to stdout. */ + /* ip is struct ipt_ip * for example */ + void (*save)(const void *ip, const struct xt_entry_match *match); + + /* Print match name or alias */ + const char *(*alias)(const struct xt_entry_match *match); + + /* Pointer to list of extra command-line options */ + const struct option *extra_opts; + + /* New parser */ + void (*x6_parse)(struct xt_option_call *); + void (*x6_fcheck)(struct xt_fcheck_call *); + const struct xt_option_entry *x6_options; + + /* Size of per-extension instance extra "global" scratch space */ + size_t udata_size; + + /* Ignore these men behind the curtain: */ + void *udata; + unsigned int option_offset; + struct xt_entry_match *m; + unsigned int mflags; + unsigned int loaded; /* simulate loading so options are merged properly */ +}; + +struct xtables_target +{ + /* + * ABI/API version this module requires. Must be first member, + * as the rest of this struct may be subject to ABI changes. + */ + const char *version; + + struct xtables_target *next; + + + const char *name; + + /* Real target behind this, if any. */ + const char *real_name; + + /* Revision of target (0 by default). 
*/ + uint8_t revision; + + /* Extension flags */ + uint8_t ext_flags; + + uint16_t family; + + + /* Size of target data. */ + size_t size; + + /* Size of target data relevant for userspace comparison purposes */ + size_t userspacesize; + + /* Function which prints out usage message. */ + void (*help)(void); + + /* Initialize the target. */ + void (*init)(struct xt_entry_target *t); + + /* Function which parses command options; returns true if it + ate an option */ + /* entry is struct ipt_entry for example */ + int (*parse)(int c, char **argv, int invert, unsigned int *flags, + const void *entry, + struct xt_entry_target **targetinfo); + + /* Final check; exit if not ok. */ + void (*final_check)(unsigned int flags); + + /* Prints out the target iff non-NULL: put space at end */ + void (*print)(const void *ip, + const struct xt_entry_target *target, int numeric); + + /* Saves the targinfo in parsable form to stdout. */ + void (*save)(const void *ip, + const struct xt_entry_target *target); + + /* Print target name or alias */ + const char *(*alias)(const struct xt_entry_target *target); + + /* Pointer to list of extra command-line options */ + const struct option *extra_opts; + + /* New parser */ + void (*x6_parse)(struct xt_option_call *); + void (*x6_fcheck)(struct xt_fcheck_call *); + const struct xt_option_entry *x6_options; + + size_t udata_size; + + /* Ignore these men behind the curtain: */ + void *udata; + unsigned int option_offset; + struct xt_entry_target *t; + unsigned int tflags; + unsigned int used; + unsigned int loaded; /* simulate loading so options are merged properly */ +}; + +struct xtables_rule_match { + struct xtables_rule_match *next; + struct xtables_match *match; + /* Multiple matches of the same type: the ones before + the current one are completed from parsing point of view */ + bool completed; +}; + +/** + * struct xtables_pprot - + * + * A few hardcoded protocols for 'all' and in case the user has no + * /etc/protocols. 
+ */ +struct xtables_pprot { + const char *name; + uint8_t num; +}; + +enum xtables_tryload { + XTF_DONT_LOAD, + XTF_DURING_LOAD, + XTF_TRY_LOAD, + XTF_LOAD_MUST_SUCCEED, +}; + +enum xtables_exittype { + OTHER_PROBLEM = 1, + PARAMETER_PROBLEM, + VERSION_PROBLEM, + RESOURCE_PROBLEM, + XTF_ONLY_ONCE, + XTF_NO_INVERT, + XTF_BAD_VALUE, + XTF_ONE_ACTION, +}; + +struct xtables_globals +{ + unsigned int option_offset; + const char *program_name, *program_version; + struct option *orig_opts; + struct option *opts; + void (*exit_err)(enum xtables_exittype status, const char *msg, ...) __attribute__((noreturn, format(printf,2,3))); + int (*compat_rev)(const char *name, uint8_t rev, int opt); +}; + +#define XT_GETOPT_TABLEEND {.name = NULL, .has_arg = false} + +#ifdef __cplusplus +extern "C" { +#endif + +extern const char *xtables_modprobe_program; +extern struct xtables_match *xtables_matches; +extern struct xtables_target *xtables_targets; + +extern void xtables_init(void); +extern void xtables_set_nfproto(uint8_t); +extern void *xtables_calloc(size_t, size_t); +extern void *xtables_malloc(size_t); +extern void *xtables_realloc(void *, size_t); + +extern int xtables_insmod(const char *, const char *, bool); +extern int xtables_load_ko(const char *, bool); +extern int xtables_set_params(struct xtables_globals *xtp); +extern void xtables_free_opts(int reset_offset); +extern struct option *xtables_merge_options(struct option *origopts, + struct option *oldopts, const struct option *newopts, + unsigned int *option_offset); + +extern int xtables_init_all(struct xtables_globals *xtp, uint8_t nfproto); +extern struct xtables_match *xtables_find_match(const char *name, + enum xtables_tryload, struct xtables_rule_match **match); +extern struct xtables_target *xtables_find_target(const char *name, + enum xtables_tryload); +extern int xtables_compatible_revision(const char *name, uint8_t revision, + int opt); + +extern void xtables_rule_matches_free(struct xtables_rule_match 
**matches); + +/* Your shared library should call one of these. */ +extern void xtables_register_match(struct xtables_match *me); +extern void xtables_register_matches(struct xtables_match *, unsigned int); +extern void xtables_register_target(struct xtables_target *me); +extern void xtables_register_targets(struct xtables_target *, unsigned int); + +extern bool xtables_strtoul(const char *, char **, uintmax_t *, + uintmax_t, uintmax_t); +extern bool xtables_strtoui(const char *, char **, unsigned int *, + unsigned int, unsigned int); +extern int xtables_service_to_port(const char *name, const char *proto); +extern uint16_t xtables_parse_port(const char *port, const char *proto); +extern void +xtables_parse_interface(const char *arg, char *vianame, unsigned char *mask); + +/* this is a special 64bit data type that is 8-byte aligned */ +#define aligned_u64 uint64_t __attribute__((aligned(8))) + +extern struct xtables_globals *xt_params; +#define xtables_error (xt_params->exit_err) + +extern void xtables_param_act(unsigned int, const char *, ...); + +extern const char *xtables_ipaddr_to_numeric(const struct in_addr *); +extern const char *xtables_ipaddr_to_anyname(const struct in_addr *); +extern const char *xtables_ipmask_to_numeric(const struct in_addr *); +extern struct in_addr *xtables_numeric_to_ipaddr(const char *); +extern struct in_addr *xtables_numeric_to_ipmask(const char *); +extern int xtables_ipmask_to_cidr(const struct in_addr *); +extern void xtables_ipparse_any(const char *, struct in_addr **, + struct in_addr *, unsigned int *); +extern void xtables_ipparse_multiple(const char *, struct in_addr **, + struct in_addr **, unsigned int *); + +extern struct in6_addr *xtables_numeric_to_ip6addr(const char *); +extern const char *xtables_ip6addr_to_numeric(const struct in6_addr *); +extern const char *xtables_ip6addr_to_anyname(const struct in6_addr *); +extern const char *xtables_ip6mask_to_numeric(const struct in6_addr *); +extern int 
xtables_ip6mask_to_cidr(const struct in6_addr *); +extern void xtables_ip6parse_any(const char *, struct in6_addr **, + struct in6_addr *, unsigned int *); +extern void xtables_ip6parse_multiple(const char *, struct in6_addr **, + struct in6_addr **, unsigned int *); + +/** + * Print the specified value to standard output, quoting dangerous + * characters if required. + */ +extern void xtables_save_string(const char *value); + +#define FMT_NUMERIC 0x0001 +#define FMT_NOCOUNTS 0x0002 +#define FMT_KILOMEGAGIGA 0x0004 +#define FMT_OPTIONS 0x0008 +#define FMT_NOTABLE 0x0010 +#define FMT_NOTARGET 0x0020 +#define FMT_VIA 0x0040 +#define FMT_NONEWLINE 0x0080 +#define FMT_LINENUMBERS 0x0100 + +#define FMT_PRINT_RULE (FMT_NOCOUNTS | FMT_OPTIONS | FMT_VIA \ + | FMT_NUMERIC | FMT_NOTABLE) +#define FMT(tab,notab) ((format) & FMT_NOTABLE ? (notab) : (tab)) + +extern void xtables_print_num(uint64_t number, unsigned int format); + +#if defined(ALL_INCLUSIVE) || defined(NO_SHARED_LIBS) +# ifdef _INIT +# undef _init +# define _init _INIT +# endif + extern void init_extensions(void); + extern void init_extensions4(void); + extern void init_extensions6(void); +#else +# define _init __attribute__((constructor)) _INIT +#endif + +extern const struct xtables_pprot xtables_chain_protos[]; +extern uint16_t xtables_parse_protocol(const char *s); + +/* kernel revision handling */ +extern int kernel_version; +extern void get_kernel_version(void); +#define LINUX_VERSION(x,y,z) (0x10000*(x) + 0x100*(y) + z) +#define LINUX_VERSION_MAJOR(x) (((x)>>16) & 0xFF) +#define LINUX_VERSION_MINOR(x) (((x)>> 8) & 0xFF) +#define LINUX_VERSION_PATCH(x) ( (x) & 0xFF) + +/* xtoptions.c */ +extern void xtables_option_metavalidate(const char *, + const struct xt_option_entry *); +extern struct option *xtables_options_xfrm(struct option *, struct option *, + const struct xt_option_entry *, + unsigned int *); +extern void xtables_option_parse(struct xt_option_call *); +extern void xtables_option_tpcall(unsigned 
int, char **, bool, + struct xtables_target *, void *); +extern void xtables_option_mpcall(unsigned int, char **, bool, + struct xtables_match *, void *); +extern void xtables_option_tfcall(struct xtables_target *); +extern void xtables_option_mfcall(struct xtables_match *); +extern void xtables_options_fcheck(const char *, unsigned int, + const struct xt_option_entry *); + +extern struct xtables_lmap *xtables_lmap_init(const char *); +extern void xtables_lmap_free(struct xtables_lmap *); +extern int xtables_lmap_name2id(const struct xtables_lmap *, const char *); +extern const char *xtables_lmap_id2name(const struct xtables_lmap *, int); + +#ifdef XTABLES_INTERNAL + +/* Shipped modules rely on this... */ + +# ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +# endif + +extern void _init(void); + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _XTABLES_H */ From a4c89d808734fe980813587cc1b7899835376a62 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Jan 2016 09:14:29 -0800 Subject: [PATCH 128/151] update most kernel headers still have issues with xtables --- include/linux/bpf.h | 1 + include/linux/if_link.h | 1 + include/linux/ila.h | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2e2524d4..39e7f33c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -269,6 +269,7 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_perf_event_output, + BPF_FUNC_skb_load_bytes, __BPF_FUNC_MAX_ID, }; diff --git a/include/linux/if_link.h b/include/linux/if_link.h index c9ad487d..d91f2c97 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -216,6 +216,7 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, }; /* Bridge section */ diff --git a/include/linux/ila.h b/include/linux/ila.h index f08e8d77..4f9e1dea 100644 --- a/include/linux/ila.h 
+++ b/include/linux/ila.h @@ -3,13 +3,35 @@ #ifndef _LINUX_ILA_H #define _LINUX_ILA_H +/* NETLINK_GENERIC related info */ +#define ILA_GENL_NAME "ila" +#define ILA_GENL_VERSION 0x1 + enum { ILA_ATTR_UNSPEC, ILA_ATTR_LOCATOR, /* u64 */ + ILA_ATTR_IDENTIFIER, /* u64 */ + ILA_ATTR_LOCATOR_MATCH, /* u64 */ + ILA_ATTR_IFINDEX, /* s32 */ + ILA_ATTR_DIR, /* u32 */ __ILA_ATTR_MAX, }; #define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) +enum { + ILA_CMD_UNSPEC, + ILA_CMD_ADD, + ILA_CMD_DEL, + ILA_CMD_GET, + + __ILA_CMD_MAX, +}; + +#define ILA_CMD_MAX (__ILA_CMD_MAX - 1) + +#define ILA_DIR_IN (1 << 0) +#define ILA_DIR_OUT (1 << 1) + #endif /* _LINUX_ILA_H */ From 8e098dd81a1d915c641a094a0bc2bf1c52a8e14e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 4 Jan 2016 10:58:04 +0100 Subject: [PATCH 129/151] iplink: support setting addrgenmode stable_secret MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible to switch to another addrgenmode after setting a valid secret. Allow switching back without reconfiguring the secret for completeness. 
Cc: Hannes Frederic Sowa Signed-off-by: Bjørn Mork --- ip/iplink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ip/iplink.c b/ip/iplink.c index f30de86d..e824082f 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -84,7 +84,7 @@ void iplink_usage(void) fprintf(stderr, " [ state { auto | enable | disable} ] ]\n"); fprintf(stderr, " [ master DEVICE ]\n"); fprintf(stderr, " [ nomaster ]\n"); - fprintf(stderr, " [ addrgenmode { eui64 | none } ]\n"); + fprintf(stderr, " [ addrgenmode { eui64 | none | stable_secret } ]\n"); fprintf(stderr, " [ protodown { on | off } ]\n"); fprintf(stderr, " ip link show [ DEVICE | group GROUP ] [up] [master DEV] [type TYPE]\n"); @@ -176,6 +176,8 @@ static int get_addr_gen_mode(const char *mode) return IN6_ADDR_GEN_MODE_EUI64; if (strcasecmp(mode, "none") == 0) return IN6_ADDR_GEN_MODE_NONE; + if (strcasecmp(mode, "stable_secret") == 0) + return IN6_ADDR_GEN_MODE_STABLE_PRIVACY; return -1; } From 8e12bc0a9df46516b851c9baa48d9da6b0999022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 4 Jan 2016 10:58:05 +0100 Subject: [PATCH 130/151] iplink: support show and set of "addrgenmode random" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "random" is a new IPv6 addrgenmode, enabling "stable_secret" type addresses with an auto-generated secret. 
$ ip link set eth0 addrgenmode random $ ip -d link show dev eth0 2: eth0: mtu 1500 qdisc pfifo_fast state DOWN mode DEFAULT group default qlen 1000 link/ether 00:21:86:a3:25:7d brd ff:ff:ff:ff:ff:ff promiscuity 0 addrgenmode random Cc: Hannes Frederic Sowa Signed-off-by: Bjørn Mork --- ip/ipaddress.c | 3 +++ ip/iplink.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index a495a391..9d254d27 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -296,6 +296,9 @@ static void print_af_spec(FILE *fp, struct rtattr *af_spec_attr) case IN6_ADDR_GEN_MODE_STABLE_PRIVACY: fprintf(fp, "addrgenmode stable_secret "); break; + case IN6_ADDR_GEN_MODE_RANDOM: + fprintf(fp, "addrgenmode random "); + break; default: fprintf(fp, "addrgenmode %#.2hhx ", mode); break; diff --git a/ip/iplink.c b/ip/iplink.c index e824082f..75b21540 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -84,7 +84,7 @@ void iplink_usage(void) fprintf(stderr, " [ state { auto | enable | disable} ] ]\n"); fprintf(stderr, " [ master DEVICE ]\n"); fprintf(stderr, " [ nomaster ]\n"); - fprintf(stderr, " [ addrgenmode { eui64 | none | stable_secret } ]\n"); + fprintf(stderr, " [ addrgenmode { eui64 | none | stable_secret | random } ]\n"); fprintf(stderr, " [ protodown { on | off } ]\n"); fprintf(stderr, " ip link show [ DEVICE | group GROUP ] [up] [master DEV] [type TYPE]\n"); @@ -178,6 +178,8 @@ static int get_addr_gen_mode(const char *mode) return IN6_ADDR_GEN_MODE_NONE; if (strcasecmp(mode, "stable_secret") == 0) return IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + if (strcasecmp(mode, "random") == 0) + return IN6_ADDR_GEN_MODE_RANDOM; return -1; } From 8f0777a857679678f3ff89e05dbde4594a58930c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 4 Jan 2016 10:58:06 +0100 Subject: [PATCH 131/151] man: iplink: document new addrgenmodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Hannes Frederic Sowa 
Signed-off-by: Bjørn Mork --- man/man8/ip-link.8.in | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index ac6f4813..189a8f15 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -150,7 +150,7 @@ ip-link \- network device configuration .br .B nomaster " |" .br -.B addrgenmode { eui64 | none } +.B addrgenmode { eui64 | none | stable_secret | random } .br .B link-netnsid ID .BR " }" @@ -1029,8 +1029,20 @@ set master device of the device (enslave device). unset master device of the device (release device). .TP -.BR "addrgenmode eui64 " or " addrgenmode none" -set IPv6 address generation mode +.BI addrgenmode " eui64|none|stable_secret|random" +set the IPv6 address generation mode + +.I eui64 +- use a Modified EUI-64 format interface identifier + +.I none +- disable automatic address generation + +.I stable_secret +- generate the interface identifier based on a preset /proc/sys/net/ipv6/conf/{default,DEVICE}/stable_secret + +.I random +- like stable_secret, but auto-generate a new random secret if none is set .TP .BR "link-netnsid " From 0257369837c670867a844f9515e45b0bed44067a Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Tue, 5 Jan 2016 10:57:39 +0100 Subject: [PATCH 132/151] tipc: fix help text spelling error in node.c --- tipc/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tipc/node.c b/tipc/node.c index 163fb743..201fe1a4 100644 --- a/tipc/node.c +++ b/tipc/node.c @@ -245,7 +245,7 @@ static int cmd_node_get(struct nlmsghdr *nlh, const struct cmd *cmd, void cmd_node_help(struct cmdl *cmdl) { fprintf(stderr, - "Usage: %s media COMMAND [ARGS] ...\n\n" + "Usage: %s node COMMAND [ARGS] ...\n\n" "COMMANDS\n" " list - List remote nodes\n" " get - Get local node parameters\n" From d4585a4bb120e2f60b088a7e934bf2ae4e6b5b68 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Tue, 5 Jan 2016 10:57:40 +0100 Subject: [PATCH 133/151] tipc: add peer remove 
functionality This enables a user to remove an offline peer from the kernel data structures. This could for example be useful when deliberately scaling in peer nodes in a cloud environment. Signed-off-by: Richard Alpe Reviewed-by: Jon Maloy Reviewed-by: Ying Xue --- include/linux/tipc_netlink.h | 1 + man/man8/tipc-bearer.8 | 1 + man/man8/tipc-link.8 | 1 + man/man8/tipc-media.8 | 1 + man/man8/tipc-nametable.8 | 1 + man/man8/tipc-node.8 | 1 + man/man8/tipc-peer.8 | 52 ++++++++++++++++++++ man/man8/tipc.8 | 1 + tipc/Makefile | 2 +- tipc/peer.c | 93 ++++++++++++++++++++++++++++++++++++ tipc/peer.h | 21 ++++++++ tipc/tipc.c | 3 ++ 12 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 man/man8/tipc-peer.8 create mode 100644 tipc/peer.c create mode 100644 tipc/peer.h diff --git a/include/linux/tipc_netlink.h b/include/linux/tipc_netlink.h index d4c8f142..25eb645e 100644 --- a/include/linux/tipc_netlink.h +++ b/include/linux/tipc_netlink.h @@ -56,6 +56,7 @@ enum { TIPC_NL_NET_GET, TIPC_NL_NET_SET, TIPC_NL_NAME_TABLE_GET, + TIPC_NL_PEER_REMOVE, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 index 50a1ed24..565ee01d 100644 --- a/man/man8/tipc-bearer.8 +++ b/man/man8/tipc-bearer.8 @@ -218,6 +218,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 index 3be8c9ad..2ee03a0b 100644 --- a/man/man8/tipc-link.8 +++ b/man/man8/tipc-link.8 @@ -213,6 +213,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. 
.BR tipc-bearer (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 index 6c6e2b15..4689cb3f 100644 --- a/man/man8/tipc-media.8 +++ b/man/man8/tipc-media.8 @@ -74,6 +74,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 index d3397f97..4bcefe47 100644 --- a/man/man8/tipc-nametable.8 +++ b/man/man8/tipc-nametable.8 @@ -87,6 +87,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 index ef32ec7c..a72a4099 100644 --- a/man/man8/tipc-node.8 +++ b/man/man8/tipc-node.8 @@ -59,6 +59,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-nametable (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-peer.8 b/man/man8/tipc-peer.8 new file mode 100644 index 00000000..430651f7 --- /dev/null +++ b/man/man8/tipc-peer.8 @@ -0,0 +1,52 @@ +.TH TIPC-PEER 8 "04 Dec 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-peer \- modify peer information + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc peer remove address +.IR ADDRESS + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc peer --help +will show peer help and +.B tipc --help +will show general help. 
The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Peer remove +Remove an offline peer node from the local data structures. The peer is +identified by its +.B address + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc.8 b/man/man8/tipc.8 index c1165523..32943fa5 100644 --- a/man/man8/tipc.8 +++ b/man/man8/tipc.8 @@ -87,6 +87,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/tipc/Makefile b/tipc/Makefile index bc5ecfd3..f06dcb11 100644 --- a/tipc/Makefile +++ b/tipc/Makefile @@ -6,7 +6,7 @@ TIPCOBJ=bearer.o \ media.o misc.o \ msg.o nametable.o \ node.o socket.o \ - tipc.o + peer.o tipc.o include ../Config diff --git a/tipc/peer.c b/tipc/peer.c new file mode 100644 index 00000000..de0c73c3 --- /dev/null +++ b/tipc/peer.c @@ -0,0 +1,93 @@ +/* + * peer.c TIPC peer functionality. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Richard Alpe + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cmdl.h" +#include "msg.h" +#include "misc.h" +#include "peer.h" + +static int cmd_peer_rm_addr(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + char *str; + uint32_t addr; + struct nlattr *nest; + char buf[MNL_SOCKET_BUFFER_SIZE]; + + if ((cmdl->argc != cmdl->optind + 1) || help_flag) { + fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", + cmdl->argv[0]); + return -EINVAL; + } + + str = shift_cmdl(cmdl); + addr = str2addr(str); + if (!addr) + return -1; + + if (!(nlh = msg_init(buf, TIPC_NL_PEER_REMOVE))) { + fprintf(stderr, "error, message initialisation failed\n"); + return -1; + } + + nest = mnl_attr_nest_start(nlh, TIPC_NLA_NET); + mnl_attr_put_u32(nlh, TIPC_NLA_NET_ADDR, addr); + mnl_attr_nest_end(nlh, nest); + + return msg_doit(nlh, NULL, NULL); +} + +static void cmd_peer_rm_help(struct cmdl *cmdl) +{ + fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", + cmdl->argv[0]); +} + +static int cmd_peer_rm(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + const struct cmd cmds[] = { + { "address", cmd_peer_rm_addr, cmd_peer_rm_help }, + { NULL } + }; + + return run_cmd(nlh, cmd, cmds, cmdl, NULL); +} + +void cmd_peer_help(struct cmdl *cmdl) +{ + fprintf(stderr, + "Usage: %s peer COMMAND [ARGS] ...\n\n" + "COMMANDS\n" + " remove - Remove an offline peer node\n", + cmdl->argv[0]); +} + +int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, + void *data) +{ + const struct cmd cmds[] = { + { "remove", cmd_peer_rm, cmd_peer_rm_help }, + { NULL } + }; + + return run_cmd(nlh, cmd, cmds, cmdl, NULL); +} diff --git a/tipc/peer.h b/tipc/peer.h new file mode 100644 index 00000000..89722616 --- /dev/null +++ b/tipc/peer.h @@ -0,0 +1,21 @@ +/* + * peer.h TIPC peer functionality. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Richard Alpe + */ + +#ifndef _TIPC_PEER_H +#define _TIPC_PEER_H + +extern int help_flag; + +int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, + void *data); +void cmd_peer_help(struct cmdl *cmdl); + +#endif diff --git a/tipc/tipc.c b/tipc/tipc.c index 44398052..600d5e2a 100644 --- a/tipc/tipc.c +++ b/tipc/tipc.c @@ -20,6 +20,7 @@ #include "socket.h" #include "media.h" #include "node.h" +#include "peer.h" #include "cmdl.h" int help_flag; @@ -39,6 +40,7 @@ static void about(struct cmdl *cmdl) " media - Show or modify media\n" " nametable - Show nametable\n" " node - Show or modify node related parameters\n" + " peer - Peer related operations\n" " socket - Show sockets\n", cmdl->argv[0]); } @@ -59,6 +61,7 @@ int main(int argc, char *argv[]) { "media", cmd_media, cmd_media_help}, { "nametable", cmd_nametable, cmd_nametable_help}, { "node", cmd_node, cmd_node_help}, + { "peer", cmd_peer, cmd_peer_help}, { "socket", cmd_socket, cmd_socket_help}, { NULL } }; From e947d8947d34114e2ea7c5508dfbb9b10b4611c2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Jan 2016 10:29:06 -0800 Subject: [PATCH 134/151] man: fix whatis for fq The FQ man page was not following whatis formatting rules. --- man/man8/tc-fq.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/tc-fq.8 b/man/man8/tc-fq.8 index 993beb61..f058a05a 100644 --- a/man/man8/tc-fq.8 +++ b/man/man8/tc-fq.8 @@ -1,6 +1,6 @@ .TH FQ 8 "10 Sept 2015" "iproute2" "Linux" .SH NAME -Fair Queuing (FQ) \- Traffic Pacing +FQ \- Fair Queue traffic policing .SH SYNOPSIS .B tc qdisc ... 
fq [ From de7db5d85728fe8d9a33fe968427503c164a32c3 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 6 Jan 2016 17:46:50 +0100 Subject: [PATCH 135/151] tc: m_connmark: Fix help text When specifying a conntrack zone, the 'zone' keyword has to be used before the actual zone index. Signed-off-by: Phil Sutter --- tc/m_connmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/m_connmark.c b/tc/m_connmark.c index 41ca4b25..6974c9ba 100644 --- a/tc/m_connmark.c +++ b/tc/m_connmark.c @@ -27,7 +27,7 @@ static void explain(void) { - fprintf(stderr, "Usage: ... connmark [ZONE] [BRANCH] [index ]\n"); + fprintf(stderr, "Usage: ... connmark [zone ZONE] [BRANCH] [index ]\n"); fprintf(stderr, "where :\n" "\tZONE is the conntrack zone\n" "\tBRANCH := reclassify|pipe|drop|continue|ok\n"); From f921f567d1b9a7d6ad9b99049c7df816c47f92eb Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 9 Jan 2016 16:02:12 -0800 Subject: [PATCH 136/151] iplink: replace exit with return This patch replaces exits with returns in iplink command. Helps to continue on errors when invoked with ip -force -batch. 
Signed-off-by: Roopa Prabhu --- ip/iplink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/iplink.c b/ip/iplink.c index f30de86d..c706d208 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -710,7 +710,7 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) req.i.ifi_index = 0; addattr32(&req.n, sizeof(req), IFLA_GROUP, group); if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) - exit(2); + return -2; return 0; } } @@ -809,7 +809,7 @@ static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv) } if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) - exit(2); + return -2; return 0; } From 8f80d450c3cb0996d839996807b77ca28bd4da09 Mon Sep 17 00:00:00 2001 From: Julien Floret Date: Thu, 7 Jan 2016 14:03:13 +0100 Subject: [PATCH 137/151] tc: fix compilation with old gcc (< 4.6) gcc < 4.6 does not handle C11 syntax for the static initialization of anonymous struct/union, hence the following error: tc_bpf.c:260: error: unknown field map_type specified in initializer Signed-off-by: Julien Floret Signed-off-by: Nicolas Dichtel Acked-by: Daniel Borkmann --- tc/tc_bpf.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 276871a5..47993bad 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -257,12 +257,14 @@ static bool bpf_may_skip_map_creation(int file_fd) static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, unsigned int size_value, unsigned int max_elem) { - union bpf_attr attr = { - .map_type = type, - .key_size = size_key, - .value_size = size_value, - .max_entries = max_elem, - }; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_type = type; + attr.key_size = size_key; + attr.value_size = size_value; + attr.max_entries = max_elem; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } @@ -270,12 +272,14 @@ static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, static int 
bpf_update_map(int fd, const void *key, const void *value, uint64_t flags) { - union bpf_attr attr = { - .map_fd = fd, - .key = bpf_ptr_to_u64(key), - .value = bpf_ptr_to_u64(value), - .flags = flags, - }; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.map_fd = fd; + attr.key = bpf_ptr_to_u64(key); + attr.value = bpf_ptr_to_u64(value); + attr.flags = flags; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } @@ -283,15 +287,17 @@ static int bpf_update_map(int fd, const void *key, const void *value, static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, unsigned int len, const char *license) { - union bpf_attr attr = { - .prog_type = type, - .insns = bpf_ptr_to_u64(insns), - .insn_cnt = len / sizeof(struct bpf_insn), - .license = bpf_ptr_to_u64(license), - .log_buf = bpf_ptr_to_u64(bpf_log_buf), - .log_size = sizeof(bpf_log_buf), - .log_level = 1, - }; + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + + attr.prog_type = type; + attr.insns = bpf_ptr_to_u64(insns); + attr.insn_cnt = len / sizeof(struct bpf_insn); + attr.license = bpf_ptr_to_u64(license); + attr.log_buf = bpf_ptr_to_u64(bpf_log_buf); + attr.log_size = sizeof(bpf_log_buf); + attr.log_level = 1; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } From 488b41d020fb06428b90289f70a41210718f52b7 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 10 Jan 2016 14:56:31 -0500 Subject: [PATCH 138/151] tc: flower no need to specify the ethertype since all tc classifiers are required to specify ethertype as part of grammar By not allowing eth_type to be specified we remove contradiction for example when a user specifies: tc filter add ... 
priority xxx protocol ip flower eth_type ipv6 This patch removes that contradiction Signed-off-by: Jamal Hadi Salim --- tc/f_flower.c | 55 ++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/tc/f_flower.c b/tc/f_flower.c index a9b2c4df..db9cc296 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -31,7 +31,7 @@ static void explain(void) fprintf(stderr, " MATCH := { indev DEV-NAME | \n"); fprintf(stderr, " dst_mac MAC-ADDR | \n"); fprintf(stderr, " src_mac MAC-ADDR | \n"); - fprintf(stderr, " eth_type [ipv4 | ipv6 | ETH-TYPE ] | \n"); + fprintf(stderr, " [ipv4 | ipv6 ] | \n"); fprintf(stderr, " ip_proto [tcp | udp | IP-PROTO ] | \n"); fprintf(stderr, " dst_ip [ IPV4-ADDR | IPV6-ADDR ] | \n"); fprintf(stderr, " src_ip [ IPV4-ADDR | IPV6-ADDR ] | \n"); @@ -60,29 +60,6 @@ static int flower_parse_eth_addr(char *str, int addr_type, int mask_type, return 0; } -static int flower_parse_eth_type(char *str, int type, __be16 *p_eth_type, - struct nlmsghdr *n) -{ - int ret; - __be16 eth_type; - - if (matches(str, "ipv4") == 0) { - eth_type = htons(ETH_P_IP); - } else if (matches(str, "ipv6") == 0) { - eth_type = htons(ETH_P_IPV6); - } else { - __u16 tmp; - - ret = get_u16(&tmp, str, 16); - if (ret) - return -1; - eth_type = htons(tmp); - } - addattr16(n, MAX_MSG, type, eth_type); - *p_eth_type = eth_type; - return 0; -} - static int flower_parse_ip_proto(char *str, __be16 eth_type, int type, __u8 *p_ip_proto, struct nlmsghdr *n) { @@ -188,12 +165,9 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, int ret; struct tcmsg *t = NLMSG_DATA(n); struct rtattr *tail; - __be16 eth_type = 0; + __be16 eth_type = TC_H_MIN(t->tcm_info); __u8 ip_proto = 0xff; - if (argc == 0) - return 0; - if (handle) { ret = get_u32(&t->tcm_handle, handle, 0); if (ret) { @@ -205,6 +179,11 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, tail = (struct rtattr *) (((void *) n) + NLMSG_ALIGN(n->nlmsg_len)); 
addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0); + if (argc == 0) { + /*at minimal we will match all ethertype packets */ + goto parse_done; + } + while (argc > 0) { if (matches(*argv, "classid") == 0 || matches(*argv, "flowid") == 0) { @@ -244,15 +223,6 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, fprintf(stderr, "Illegal \"src_mac\"\n"); return -1; } - } else if (matches(*argv, "eth_type") == 0) { - NEXT_ARG(); - ret = flower_parse_eth_type(*argv, - TCA_FLOWER_KEY_ETH_TYPE, - &eth_type, n); - if (ret < 0) { - fprintf(stderr, "Illegal \"eth_type\"\n"); - return -1; - } } else if (matches(*argv, "ip_proto") == 0) { NEXT_ARG(); ret = flower_parse_ip_proto(*argv, eth_type, @@ -323,6 +293,14 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, argc--; argv++; } +parse_done: + ret = addattr16(n, MAX_MSG, TCA_FLOWER_KEY_ETH_TYPE, eth_type); + if (ret) { + fprintf(stderr, "Illegal \"eth_type\"(0x%x)\n", + ntohs(eth_type)); + return -1; + } + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; return 0; @@ -489,7 +467,8 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, if (tb[TCA_FLOWER_CLASSID]) { SPRINT_BUF(b1); fprintf(f, "classid %s ", - sprint_tc_classid(rta_getattr_u32(tb[TCA_FLOWER_CLASSID]), b1)); + sprint_tc_classid(rta_getattr_u32(tb[TCA_FLOWER_CLASSID]), + b1)); } if (tb[TCA_FLOWER_INDEV]) { From 19ec5f8393de1e7481bde8eb7a2b05f04f419458 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2016 08:31:46 -0800 Subject: [PATCH 139/151] Revert "tipc: add peer remove functionality" This reverts commit d4585a4bb120e2f60b088a7e934bf2ae4e6b5b68. This commit is meant for later kernel. 
--- include/linux/tipc_netlink.h | 1 - man/man8/tipc-bearer.8 | 1 - man/man8/tipc-link.8 | 1 - man/man8/tipc-media.8 | 1 - man/man8/tipc-nametable.8 | 1 - man/man8/tipc-node.8 | 1 - man/man8/tipc-peer.8 | 52 -------------------- man/man8/tipc.8 | 1 - tipc/Makefile | 2 +- tipc/peer.c | 93 ------------------------------------ tipc/peer.h | 21 -------- tipc/tipc.c | 3 -- 12 files changed, 1 insertion(+), 177 deletions(-) delete mode 100644 man/man8/tipc-peer.8 delete mode 100644 tipc/peer.c delete mode 100644 tipc/peer.h diff --git a/include/linux/tipc_netlink.h b/include/linux/tipc_netlink.h index 25eb645e..d4c8f142 100644 --- a/include/linux/tipc_netlink.h +++ b/include/linux/tipc_netlink.h @@ -56,7 +56,6 @@ enum { TIPC_NL_NET_GET, TIPC_NL_NET_SET, TIPC_NL_NAME_TABLE_GET, - TIPC_NL_PEER_REMOVE, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 index 565ee01d..50a1ed24 100644 --- a/man/man8/tipc-bearer.8 +++ b/man/man8/tipc-bearer.8 @@ -218,7 +218,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 index 2ee03a0b..3be8c9ad 100644 --- a/man/man8/tipc-link.8 +++ b/man/man8/tipc-link.8 @@ -213,7 +213,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-bearer (8), .BR tipc-nametable (8), .BR tipc-node (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 index 4689cb3f..6c6e2b15 100644 --- a/man/man8/tipc-media.8 +++ b/man/man8/tipc-media.8 @@ -74,7 +74,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. 
.BR tipc-link (8), .BR tipc-nametable (8), .BR tipc-node (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 index 4bcefe47..d3397f97 100644 --- a/man/man8/tipc-nametable.8 +++ b/man/man8/tipc-nametable.8 @@ -87,7 +87,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-node (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 index a72a4099..ef32ec7c 100644 --- a/man/man8/tipc-node.8 +++ b/man/man8/tipc-node.8 @@ -59,7 +59,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-nametable (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-peer.8 b/man/man8/tipc-peer.8 deleted file mode 100644 index 430651f7..00000000 --- a/man/man8/tipc-peer.8 +++ /dev/null @@ -1,52 +0,0 @@ -.TH TIPC-PEER 8 "04 Dec 2015" "iproute2" "Linux" - -.\" For consistency, please keep padding right aligned. -.\" For example '.B "foo " bar' and not '.B foo " bar"' - -.SH NAME -tipc-peer \- modify peer information - -.SH SYNOPSIS -.ad l -.in +8 - -.ti -8 -.B tipc peer remove address -.IR ADDRESS - -.SH OPTIONS -Options (flags) that can be passed anywhere in the command chain. -.TP -.BR "\-h" , " --help" -Show help about last valid command. For example -.B tipc peer --help -will show peer help and -.B tipc --help -will show general help. The position of the option in the string is irrelevant. -.SH DESCRIPTION - -.SS Peer remove -Remove an offline peer node from the local data structures. The peer is -identified by its -.B address - -.SH EXIT STATUS -Exit status is 0 if command was successful or a positive integer upon failure. 
- -.SH SEE ALSO -.BR tipc (8), -.BR tipc-bearer (8), -.BR tipc-link (8), -.BR tipc-media (8), -.BR tipc-nametable (8), -.BR tipc-node (8), -.BR tipc-socket (8) -.br -.SH REPORTING BUGS -Report any bugs to the Network Developers mailing list -.B -where the development and maintenance is primarily done. -You do not have to be subscribed to the list to send a message there. - -.SH AUTHOR -Richard Alpe diff --git a/man/man8/tipc.8 b/man/man8/tipc.8 index 32943fa5..c1165523 100644 --- a/man/man8/tipc.8 +++ b/man/man8/tipc.8 @@ -87,7 +87,6 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), -.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/tipc/Makefile b/tipc/Makefile index f06dcb11..bc5ecfd3 100644 --- a/tipc/Makefile +++ b/tipc/Makefile @@ -6,7 +6,7 @@ TIPCOBJ=bearer.o \ media.o misc.o \ msg.o nametable.o \ node.o socket.o \ - peer.o tipc.o + tipc.o include ../Config diff --git a/tipc/peer.c b/tipc/peer.c deleted file mode 100644 index de0c73c3..00000000 --- a/tipc/peer.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * peer.c TIPC peer functionality. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * Authors: Richard Alpe - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "cmdl.h" -#include "msg.h" -#include "misc.h" -#include "peer.h" - -static int cmd_peer_rm_addr(struct nlmsghdr *nlh, const struct cmd *cmd, - struct cmdl *cmdl, void *data) -{ - char *str; - uint32_t addr; - struct nlattr *nest; - char buf[MNL_SOCKET_BUFFER_SIZE]; - - if ((cmdl->argc != cmdl->optind + 1) || help_flag) { - fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", - cmdl->argv[0]); - return -EINVAL; - } - - str = shift_cmdl(cmdl); - addr = str2addr(str); - if (!addr) - return -1; - - if (!(nlh = msg_init(buf, TIPC_NL_PEER_REMOVE))) { - fprintf(stderr, "error, message initialisation failed\n"); - return -1; - } - - nest = mnl_attr_nest_start(nlh, TIPC_NLA_NET); - mnl_attr_put_u32(nlh, TIPC_NLA_NET_ADDR, addr); - mnl_attr_nest_end(nlh, nest); - - return msg_doit(nlh, NULL, NULL); -} - -static void cmd_peer_rm_help(struct cmdl *cmdl) -{ - fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", - cmdl->argv[0]); -} - -static int cmd_peer_rm(struct nlmsghdr *nlh, const struct cmd *cmd, - struct cmdl *cmdl, void *data) -{ - const struct cmd cmds[] = { - { "address", cmd_peer_rm_addr, cmd_peer_rm_help }, - { NULL } - }; - - return run_cmd(nlh, cmd, cmds, cmdl, NULL); -} - -void cmd_peer_help(struct cmdl *cmdl) -{ - fprintf(stderr, - "Usage: %s peer COMMAND [ARGS] ...\n\n" - "COMMANDS\n" - " remove - Remove an offline peer node\n", - cmdl->argv[0]); -} - -int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, - void *data) -{ - const struct cmd cmds[] = { - { "remove", cmd_peer_rm, cmd_peer_rm_help }, - { NULL } - }; - - return run_cmd(nlh, cmd, cmds, cmdl, NULL); -} diff --git a/tipc/peer.h b/tipc/peer.h deleted file mode 100644 index 89722616..00000000 --- a/tipc/peer.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * peer.h TIPC peer functionality. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Richard Alpe - */ - -#ifndef _TIPC_PEER_H -#define _TIPC_PEER_H - -extern int help_flag; - -int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, - void *data); -void cmd_peer_help(struct cmdl *cmdl); - -#endif diff --git a/tipc/tipc.c b/tipc/tipc.c index 600d5e2a..44398052 100644 --- a/tipc/tipc.c +++ b/tipc/tipc.c @@ -20,7 +20,6 @@ #include "socket.h" #include "media.h" #include "node.h" -#include "peer.h" #include "cmdl.h" int help_flag; @@ -40,7 +39,6 @@ static void about(struct cmdl *cmdl) " media - Show or modify media\n" " nametable - Show nametable\n" " node - Show or modify node related parameters\n" - " peer - Peer related operations\n" " socket - Show sockets\n", cmdl->argv[0]); } @@ -61,7 +59,6 @@ int main(int argc, char *argv[]) { "media", cmd_media, cmd_media_help}, { "nametable", cmd_nametable, cmd_nametable_help}, { "node", cmd_node, cmd_node_help}, - { "peer", cmd_peer, cmd_peer_help}, { "socket", cmd_socket, cmd_socket_help}, { NULL } }; From 92a0236a3cdf3438000834121b7ea8a09f1f52b1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2016 08:33:03 -0800 Subject: [PATCH 140/151] v4.4.0 --- include/SNAPSHOT.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/SNAPSHOT.h b/include/SNAPSHOT.h index bd2ccea6..58d36327 100644 --- a/include/SNAPSHOT.h +++ b/include/SNAPSHOT.h @@ -1 +1 @@ -static const char SNAPSHOT[] = "151103"; +static const char SNAPSHOT[] = "160111"; From f9dec657e4578d50a2432e5842d97c857faa6c2c Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Tue, 5 Jan 2016 10:57:40 +0100 Subject: [PATCH 141/151] tipc: add peer remove functionality This enables a user to remove an offline peer from the kernel data 
structures. This could for example be useful when deliberately scaling in peer nodes in a cloud environment. Signed-off-by: Richard Alpe Reviewed-by: Jon Maloy Reviewed-by: Ying Xue --- include/linux/tipc_netlink.h | 1 + man/man8/tipc-bearer.8 | 1 + man/man8/tipc-link.8 | 1 + man/man8/tipc-media.8 | 1 + man/man8/tipc-nametable.8 | 1 + man/man8/tipc-node.8 | 1 + man/man8/tipc-peer.8 | 52 ++++++++++++++++++++ man/man8/tipc.8 | 1 + tipc/Makefile | 2 +- tipc/peer.c | 93 ++++++++++++++++++++++++++++++++++++ tipc/peer.h | 21 ++++++++ tipc/tipc.c | 3 ++ 12 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 man/man8/tipc-peer.8 create mode 100644 tipc/peer.c create mode 100644 tipc/peer.h diff --git a/include/linux/tipc_netlink.h b/include/linux/tipc_netlink.h index d4c8f142..25eb645e 100644 --- a/include/linux/tipc_netlink.h +++ b/include/linux/tipc_netlink.h @@ -56,6 +56,7 @@ enum { TIPC_NL_NET_GET, TIPC_NL_NET_SET, TIPC_NL_NAME_TABLE_GET, + TIPC_NL_PEER_REMOVE, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/man/man8/tipc-bearer.8 b/man/man8/tipc-bearer.8 index 50a1ed24..565ee01d 100644 --- a/man/man8/tipc-bearer.8 +++ b/man/man8/tipc-bearer.8 @@ -218,6 +218,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-link.8 b/man/man8/tipc-link.8 index 3be8c9ad..2ee03a0b 100644 --- a/man/man8/tipc-link.8 +++ b/man/man8/tipc-link.8 @@ -213,6 +213,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. 
.BR tipc-bearer (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-media.8 b/man/man8/tipc-media.8 index 6c6e2b15..4689cb3f 100644 --- a/man/man8/tipc-media.8 +++ b/man/man8/tipc-media.8 @@ -74,6 +74,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-nametable.8 b/man/man8/tipc-nametable.8 index d3397f97..4bcefe47 100644 --- a/man/man8/tipc-nametable.8 +++ b/man/man8/tipc-nametable.8 @@ -87,6 +87,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-node.8 b/man/man8/tipc-node.8 index ef32ec7c..a72a4099 100644 --- a/man/man8/tipc-node.8 +++ b/man/man8/tipc-node.8 @@ -59,6 +59,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-link (8), .BR tipc-media (8), .BR tipc-nametable (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/man/man8/tipc-peer.8 b/man/man8/tipc-peer.8 new file mode 100644 index 00000000..430651f7 --- /dev/null +++ b/man/man8/tipc-peer.8 @@ -0,0 +1,52 @@ +.TH TIPC-PEER 8 "04 Dec 2015" "iproute2" "Linux" + +.\" For consistency, please keep padding right aligned. +.\" For example '.B "foo " bar' and not '.B foo " bar"' + +.SH NAME +tipc-peer \- modify peer information + +.SH SYNOPSIS +.ad l +.in +8 + +.ti -8 +.B tipc peer remove address +.IR ADDRESS + +.SH OPTIONS +Options (flags) that can be passed anywhere in the command chain. +.TP +.BR "\-h" , " --help" +Show help about last valid command. For example +.B tipc peer --help +will show peer help and +.B tipc --help +will show general help. 
The position of the option in the string is irrelevant. +.SH DESCRIPTION + +.SS Peer remove +Remove an offline peer node from the local data structures. The peer is +identified by its +.B address + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR tipc (8), +.BR tipc-bearer (8), +.BR tipc-link (8), +.BR tipc-media (8), +.BR tipc-nametable (8), +.BR tipc-node (8), +.BR tipc-socket (8) +.br +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Richard Alpe diff --git a/man/man8/tipc.8 b/man/man8/tipc.8 index c1165523..32943fa5 100644 --- a/man/man8/tipc.8 +++ b/man/man8/tipc.8 @@ -87,6 +87,7 @@ Exit status is 0 if command was successful or a positive integer upon failure. .BR tipc-media (8), .BR tipc-nametable (8), .BR tipc-node (8), +.BR tipc-peer (8), .BR tipc-socket (8) .br .SH REPORTING BUGS diff --git a/tipc/Makefile b/tipc/Makefile index bc5ecfd3..f06dcb11 100644 --- a/tipc/Makefile +++ b/tipc/Makefile @@ -6,7 +6,7 @@ TIPCOBJ=bearer.o \ media.o misc.o \ msg.o nametable.o \ node.o socket.o \ - tipc.o + peer.o tipc.o include ../Config diff --git a/tipc/peer.c b/tipc/peer.c new file mode 100644 index 00000000..de0c73c3 --- /dev/null +++ b/tipc/peer.c @@ -0,0 +1,93 @@ +/* + * peer.c TIPC peer functionality. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Richard Alpe + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "cmdl.h" +#include "msg.h" +#include "misc.h" +#include "peer.h" + +static int cmd_peer_rm_addr(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + char *str; + uint32_t addr; + struct nlattr *nest; + char buf[MNL_SOCKET_BUFFER_SIZE]; + + if ((cmdl->argc != cmdl->optind + 1) || help_flag) { + fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", + cmdl->argv[0]); + return -EINVAL; + } + + str = shift_cmdl(cmdl); + addr = str2addr(str); + if (!addr) + return -1; + + if (!(nlh = msg_init(buf, TIPC_NL_PEER_REMOVE))) { + fprintf(stderr, "error, message initialisation failed\n"); + return -1; + } + + nest = mnl_attr_nest_start(nlh, TIPC_NLA_NET); + mnl_attr_put_u32(nlh, TIPC_NLA_NET_ADDR, addr); + mnl_attr_nest_end(nlh, nest); + + return msg_doit(nlh, NULL, NULL); +} + +static void cmd_peer_rm_help(struct cmdl *cmdl) +{ + fprintf(stderr, "Usage: %s peer remove address ADDRESS\n", + cmdl->argv[0]); +} + +static int cmd_peer_rm(struct nlmsghdr *nlh, const struct cmd *cmd, + struct cmdl *cmdl, void *data) +{ + const struct cmd cmds[] = { + { "address", cmd_peer_rm_addr, cmd_peer_rm_help }, + { NULL } + }; + + return run_cmd(nlh, cmd, cmds, cmdl, NULL); +} + +void cmd_peer_help(struct cmdl *cmdl) +{ + fprintf(stderr, + "Usage: %s peer COMMAND [ARGS] ...\n\n" + "COMMANDS\n" + " remove - Remove an offline peer node\n", + cmdl->argv[0]); +} + +int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, + void *data) +{ + const struct cmd cmds[] = { + { "remove", cmd_peer_rm, cmd_peer_rm_help }, + { NULL } + }; + + return run_cmd(nlh, cmd, cmds, cmdl, NULL); +} diff --git a/tipc/peer.h b/tipc/peer.h new file mode 100644 index 00000000..89722616 --- /dev/null +++ b/tipc/peer.h @@ -0,0 +1,21 @@ +/* + * peer.h TIPC peer functionality. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Richard Alpe + */ + +#ifndef _TIPC_PEER_H +#define _TIPC_PEER_H + +extern int help_flag; + +int cmd_peer(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, + void *data); +void cmd_peer_help(struct cmdl *cmdl); + +#endif diff --git a/tipc/tipc.c b/tipc/tipc.c index 44398052..600d5e2a 100644 --- a/tipc/tipc.c +++ b/tipc/tipc.c @@ -20,6 +20,7 @@ #include "socket.h" #include "media.h" #include "node.h" +#include "peer.h" #include "cmdl.h" int help_flag; @@ -39,6 +40,7 @@ static void about(struct cmdl *cmdl) " media - Show or modify media\n" " nametable - Show nametable\n" " node - Show or modify node related parameters\n" + " peer - Peer related operations\n" " socket - Show sockets\n", cmdl->argv[0]); } @@ -59,6 +61,7 @@ int main(int argc, char *argv[]) { "media", cmd_media, cmd_media_help}, { "nametable", cmd_nametable, cmd_nametable_help}, { "node", cmd_node, cmd_node_help}, + { "peer", cmd_peer, cmd_peer_help}, { "socket", cmd_socket, cmd_socket_help}, { NULL } }; From bc223ab8617720902d75d855a702273dc6d232c8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 18 Jan 2016 09:37:38 -0800 Subject: [PATCH 142/151] Revert "tc: fix compilation with old gcc (< 4.6)" This reverts commit 8f80d450c3cb0996d839996807b77ca28bd4da09. 
--- tc/tc_bpf.c | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 47993bad..276871a5 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -257,14 +257,12 @@ static bool bpf_may_skip_map_creation(int file_fd) static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, unsigned int size_value, unsigned int max_elem) { - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - - attr.map_type = type; - attr.key_size = size_key; - attr.value_size = size_value; - attr.max_entries = max_elem; + union bpf_attr attr = { + .map_type = type, + .key_size = size_key, + .value_size = size_value, + .max_entries = max_elem, + }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } @@ -272,14 +270,12 @@ static int bpf_create_map(enum bpf_map_type type, unsigned int size_key, static int bpf_update_map(int fd, const void *key, const void *value, uint64_t flags) { - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - - attr.map_fd = fd; - attr.key = bpf_ptr_to_u64(key); - attr.value = bpf_ptr_to_u64(value); - attr.flags = flags; + union bpf_attr attr = { + .map_fd = fd, + .key = bpf_ptr_to_u64(key), + .value = bpf_ptr_to_u64(value), + .flags = flags, + }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } @@ -287,17 +283,15 @@ static int bpf_update_map(int fd, const void *key, const void *value, static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns, unsigned int len, const char *license) { - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - - attr.prog_type = type; - attr.insns = bpf_ptr_to_u64(insns); - attr.insn_cnt = len / sizeof(struct bpf_insn); - attr.license = bpf_ptr_to_u64(license); - attr.log_buf = bpf_ptr_to_u64(bpf_log_buf); - attr.log_size = sizeof(bpf_log_buf); - attr.log_level = 1; + union bpf_attr attr = { + .prog_type = type, + .insns = bpf_ptr_to_u64(insns), + .insn_cnt = len / sizeof(struct bpf_insn), + .license = 
bpf_ptr_to_u64(license), + .log_buf = bpf_ptr_to_u64(bpf_log_buf), + .log_size = sizeof(bpf_log_buf), + .log_level = 1, + }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } From 7321b7db6f0648d231cb7b05671682d89f07aff8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 18 Jan 2016 09:40:13 -0800 Subject: [PATCH 143/151] update headers (post 4.4 merge window) --- include/linux/bpf.h | 26 +++++++++++++++++++++++++- include/linux/pkt_sched.h | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39e7f33c..f970f9db 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,6 +273,25 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +#define BPF_F_HDR_FIELD_MASK 0xfULL + +/* BPF_FUNC_l4_csum_replace flags. */ +#define BPF_F_PSEUDO_HDR (1ULL << 4) + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +#define BPF_F_INGRESS (1ULL << 0) + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +#define BPF_F_TUNINFO_IPV6 (1ULL << 0) + /* user accessible mirror of in-kernel sk_buff. 
* new fields can only be added to the end of this structure */ @@ -296,7 +315,12 @@ struct __sk_buff { struct bpf_tunnel_key { __u32 tunnel_id; - __u32 remote_ipv4; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; }; #endif /* __LINUX_BPF_H__ */ diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 8d2530da..8cb18b44 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -72,6 +72,10 @@ struct tc_estimator { #define TC_H_UNSPEC (0U) #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ enum tc_link_layer { From 0d45c4b420375a5c71d5af08ca4374c9f775372e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2016 01:42:19 +0100 Subject: [PATCH 144/151] tc, ingress: clean up ingress handling a bit Clean it up a bit, we can also get rid of some ugly ifdefs as in our case TC_H_INGRESS is always defined. Signed-off-by: Daniel Borkmann --- tc/q_ingress.c | 20 +++++--------------- tc/tc_qdisc.c | 11 +++-------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/tc/q_ingress.c b/tc/q_ingress.c index 30b24e7d..c3c9b403 100644 --- a/tc/q_ingress.c +++ b/tc/q_ingress.c @@ -1,5 +1,4 @@ /* - * * q_ingress.c INGRESS. * * This program is free software; you can redistribute it and/or @@ -8,20 +7,9 @@ * 2 of the License, or (at your option) any later version. * * Authors: J Hadi Salim - * - * This is here just in case it is needed - * useless right now; might be useful in the future - * */ #include -#include -#include -#include -#include -#include -#include -#include #include #include "utils.h" @@ -29,10 +17,11 @@ static void explain(void) { - fprintf(stderr, "Usage: ... ingress \n"); + fprintf(stderr, "Usage: ... 
ingress\n"); } -static int ingress_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +static int ingress_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) { while (argc > 0) { if (strcmp(*argv, "handle") == 0) { @@ -49,7 +38,8 @@ static int ingress_parse_opt(struct qdisc_util *qu, int argc, char **argv, struc return 0; } -static int ingress_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +static int ingress_print_opt(struct qdisc_util *qu, FILE *f, + struct rtattr *opt) { fprintf(f, "---------------- "); return 0; diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c index c31ae8d2..96b80852 100644 --- a/tc/tc_qdisc.c +++ b/tc/tc_qdisc.c @@ -91,20 +91,17 @@ static int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv) return -1; } req.t.tcm_parent = TC_H_ROOT; -#ifdef TC_H_INGRESS } else if (strcmp(*argv, "ingress") == 0) { if (req.t.tcm_parent) { fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n"); return -1; } req.t.tcm_parent = TC_H_INGRESS; - strncpy(k, "ingress", sizeof(k)-1); + strncpy(k, "ingress", sizeof(k) - 1); q = get_qdisc_kind(k); - req.t.tcm_handle = 0xffff0000; - - argc--; argv++; + req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); + NEXT_ARG_FWD(); break; -#endif } else if (strcmp(*argv, "parent") == 0) { __u32 handle; NEXT_ARG(); @@ -291,14 +288,12 @@ static int tc_qdisc_list(int argc, char **argv) if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); strncpy(d, *argv, sizeof(d)-1); -#ifdef TC_H_INGRESS } else if (strcmp(*argv, "ingress") == 0) { if (t.tcm_parent) { fprintf(stderr, "Duplicate parent ID\n"); usage(); } t.tcm_parent = TC_H_INGRESS; -#endif } else if (matches(*argv, "help") == 0) { usage(); } else { From 8f9afdd531560c1534be44424669add2e19deeec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2016 01:42:20 +0100 Subject: [PATCH 145/151] tc, clsact: add clsact frontend Add the tc part for the kernel commit 1f211a1b929c ("net, sched: add clsact 
qdisc"). Quoting example usage from that commit description: Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action The ingress parent alias can also be used with ingress qdisc. Signed-off-by: Daniel Borkmann --- tc/Makefile | 1 + tc/q_clsact.c | 34 ++++++++++++++++++++++++++++++++++ tc/tc_filter.c | 46 ++++++++++++++++++++++++++++++++++++++-------- tc/tc_qdisc.c | 21 +++++++++++++++------ 4 files changed, 88 insertions(+), 14 deletions(-) create mode 100644 tc/q_clsact.c diff --git a/tc/Makefile b/tc/Makefile index 56acbaa1..f5bea877 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -64,6 +64,7 @@ TCMODULES += q_fq_codel.o TCMODULES += q_fq.o TCMODULES += q_pie.o TCMODULES += q_hhf.o +TCMODULES += q_clsact.o TCMODULES += e_bpf.o ifeq ($(TC_CONFIG_IPSET), y) diff --git a/tc/q_clsact.c b/tc/q_clsact.c new file mode 100644 index 00000000..0c05dbd3 --- /dev/null +++ b/tc/q_clsact.c @@ -0,0 +1,34 @@ +#include +#include + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
clsact\n"); +} + +static int clsact_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) +{ + if (argc > 0) { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + return 0; +} + +static int clsact_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + return 0; +} + +struct qdisc_util clsact_qdisc_util = { + .id = "clsact", + .parse_qopt = clsact_parse_opt, + .print_qopt = clsact_print_opt, +}; diff --git a/tc/tc_filter.c b/tc/tc_filter.c index ff03db8f..1a1082b4 100644 --- a/tc/tc_filter.c +++ b/tc/tc_filter.c @@ -26,25 +26,21 @@ #include "tc_util.h" #include "tc_common.h" -static void usage(void); - static void usage(void) { fprintf(stderr, "Usage: tc filter [ add | del | change | replace | show ] dev STRING\n"); fprintf(stderr, " [ pref PRIO ] protocol PROTO\n"); fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); - fprintf(stderr, " [ root | classid CLASSID ] [ handle FILTERID ]\n"); - fprintf(stderr, " [ [ FILTER_TYPE ] [ help | OPTIONS ] ]\n"); + fprintf(stderr, " [ root | ingress | egress | parent CLASSID ]\n"); + fprintf(stderr, " [ handle FILTERID ] [ [ FILTER_TYPE ] [ help | OPTIONS ] ]\n"); fprintf(stderr, "\n"); - fprintf(stderr, " tc filter show [ dev STRING ] [ root | parent CLASSID ]\n"); + fprintf(stderr, " tc filter show [ dev STRING ] [ root | ingress | egress | parent CLASSID ]\n"); fprintf(stderr, "Where:\n"); fprintf(stderr, "FILTER_TYPE := { rsvp | u32 | bpf | fw | route | etc. }\n"); fprintf(stderr, "FILTERID := ... format depends on classifier, see there\n"); fprintf(stderr, "OPTIONS := ... 
try tc filter add help\n"); - return; } - static int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv) { struct { @@ -87,6 +83,20 @@ static int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv) return -1; } req.t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "ingress") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"ingress\" is duplicate parent ID\n"); + return -1; + } + req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, + TC_H_MIN_INGRESS); + } else if (strcmp(*argv, "egress") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"egress\" is duplicate parent ID\n"); + return -1; + } + req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, + TC_H_MIN_EGRESS); } else if (strcmp(*argv, "parent") == 0) { __u32 handle; NEXT_ARG(); @@ -220,11 +230,16 @@ int print_filter(const struct sockaddr_nl *who, if (!filter_parent || filter_parent != t->tcm_parent) { if (t->tcm_parent == TC_H_ROOT) fprintf(fp, "root "); + else if (t->tcm_parent == TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)) + fprintf(fp, "ingress "); + else if (t->tcm_parent == TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)) + fprintf(fp, "egress "); else { print_tc_classid(abuf, sizeof(abuf), t->tcm_parent); fprintf(fp, "parent %s ", abuf); } } + if (t->tcm_info) { f_proto = TC_H_MIN(t->tcm_info); __u32 prio = TC_H_MAJ(t->tcm_info)>>16; @@ -259,7 +274,6 @@ int print_filter(const struct sockaddr_nl *who, return 0; } - static int tc_filter_list(int argc, char **argv) { struct tcmsg t; @@ -284,6 +298,22 @@ static int tc_filter_list(int argc, char **argv) return -1; } filter_parent = t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "ingress") == 0) { + if (t.tcm_parent) { + fprintf(stderr, "Error: \"ingress\" is duplicate parent ID\n"); + return -1; + } + filter_parent = TC_H_MAKE(TC_H_CLSACT, + TC_H_MIN_INGRESS); + t.tcm_parent = filter_parent; + } else if (strcmp(*argv, "egress") == 0) { + if (t.tcm_parent) { + fprintf(stderr, "Error: \"egress\" is duplicate parent ID\n"); + return -1; 
+ } + filter_parent = TC_H_MAKE(TC_H_CLSACT, + TC_H_MIN_EGRESS); + t.tcm_parent = filter_parent; } else if (strcmp(*argv, "parent") == 0) { __u32 handle; NEXT_ARG(); diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c index 96b80852..cb861e08 100644 --- a/tc/tc_qdisc.c +++ b/tc/tc_qdisc.c @@ -26,17 +26,15 @@ #include "tc_util.h" #include "tc_common.h" -static int usage(void); - static int usage(void) { fprintf(stderr, "Usage: tc qdisc [ add | del | replace | change | show ] dev STRING\n"); - fprintf(stderr, " [ handle QHANDLE ] [ root | ingress | parent CLASSID ]\n"); + fprintf(stderr, " [ handle QHANDLE ] [ root | ingress | clsact | parent CLASSID ]\n"); fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); fprintf(stderr, " [ stab [ help | STAB_OPTIONS] ]\n"); fprintf(stderr, " [ [ QDISC_KIND ] [ help | OPTIONS ] ]\n"); fprintf(stderr, "\n"); - fprintf(stderr, " tc qdisc show [ dev STRING ] [ingress]\n"); + fprintf(stderr, " tc qdisc show [ dev STRING ] [ ingress | clsact ]\n"); fprintf(stderr, "Where:\n"); fprintf(stderr, "QDISC_KIND := { [p|b]fifo | tbf | prio | cbq | red | etc. }\n"); fprintf(stderr, "OPTIONS := ... 
try tc qdisc add help\n"); @@ -91,6 +89,17 @@ static int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv) return -1; } req.t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "clsact") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"clsact\" is a duplicate parent ID\n"); + return -1; + } + req.t.tcm_parent = TC_H_CLSACT; + strncpy(k, "clsact", sizeof(k) - 1); + q = get_qdisc_kind(k); + req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + NEXT_ARG_FWD(); + break; } else if (strcmp(*argv, "ingress") == 0) { if (req.t.tcm_parent) { fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n"); @@ -274,7 +283,6 @@ int print_qdisc(const struct sockaddr_nl *who, return 0; } - static int tc_qdisc_list(int argc, char **argv) { struct tcmsg t; @@ -288,7 +296,8 @@ static int tc_qdisc_list(int argc, char **argv) if (strcmp(*argv, "dev") == 0) { NEXT_ARG(); strncpy(d, *argv, sizeof(d)-1); - } else if (strcmp(*argv, "ingress") == 0) { + } else if (strcmp(*argv, "ingress") == 0 || + strcmp(*argv, "clsact") == 0) { if (t.tcm_parent) { fprintf(stderr, "Duplicate parent ID\n"); usage(); From cce3d4664c6bc839116e504183f9caebe6994120 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2016 02:03:07 +0100 Subject: [PATCH 146/151] tc, bpf: check section names and type everywhere When extracting sections, we better check for name and type. Noticed that some llvm versions emit .strtab and .shstrtab (e.g. saw it on pre 3.7), while more recent ones only seem to emit .strtab. Thus, make sure we get the right sections. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tc/tc_bpf.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index f9b2b007..677dd628 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -1237,14 +1237,17 @@ static int bpf_fetch_ancillary(struct bpf_elf_ctx *ctx) if (ret < 0) continue; - if (!strcmp(data.sec_name, ELF_SECTION_MAPS)) + if (data.sec_hdr.sh_type == SHT_PROGBITS && + !strcmp(data.sec_name, ELF_SECTION_MAPS)) ret = bpf_fetch_maps(ctx, i, &data); - else if (!strcmp(data.sec_name, ELF_SECTION_LICENSE)) + else if (data.sec_hdr.sh_type == SHT_PROGBITS && + !strcmp(data.sec_name, ELF_SECTION_LICENSE)) ret = bpf_fetch_license(ctx, i, &data); - else if (data.sec_hdr.sh_type == SHT_SYMTAB) + else if (data.sec_hdr.sh_type == SHT_SYMTAB && + !strcmp(data.sec_name, ".symtab")) ret = bpf_fetch_symtab(ctx, i, &data); else if (data.sec_hdr.sh_type == SHT_STRTAB && - i != ctx->elf_hdr.e_shstrndx) + !strcmp(data.sec_name, ".strtab")) ret = bpf_fetch_strtab(ctx, i, &data); if (ret < 0) { fprintf(stderr, "Error parsing section %d! 
Perhaps" @@ -1275,7 +1278,10 @@ static int bpf_fetch_prog(struct bpf_elf_ctx *ctx, const char *section) continue; ret = bpf_fill_section_data(ctx, i, &data); - if (ret < 0 || strcmp(data.sec_name, section)) + if (ret < 0 || + !(data.sec_hdr.sh_type == SHT_PROGBITS && + data.sec_hdr.sh_flags & SHF_EXECINSTR && + !strcmp(data.sec_name, section))) continue; memset(&prog, 0, sizeof(prog)); @@ -1353,7 +1359,10 @@ static int bpf_fetch_prog_relo(struct bpf_elf_ctx *ctx, const char *section) idx = data_relo.sec_hdr.sh_info; ret = bpf_fill_section_data(ctx, idx, &data_insn); - if (ret < 0 || strcmp(data_insn.sec_name, section)) + if (ret < 0 || + !(data_insn.sec_hdr.sh_type == SHT_PROGBITS && + data_insn.sec_hdr.sh_flags & SHF_EXECINSTR && + !strcmp(data_insn.sec_name, section))) continue; ret = bpf_apply_relo_data(ctx, &data_relo, &data_insn); From 8187b012731cf2699c0abd5c88673bdaebca53b2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Jan 2016 02:03:08 +0100 Subject: [PATCH 147/151] tc, bpf: more header checks on loading elf eBPF llvm backend can support different BPF formats, make sure the object we're trying to load matches with regards to endianness and while at it, also check for other attributes related to BPF ELFs. # llc --version LLVM (http://llvm.org/): LLVM version 3.8.0svn Optimized build. Built Jan 9 2016 (02:08:10). Default target: x86_64-unknown-linux-gnu Host CPU: ivybridge Registered Targets: bpf - BPF (host endian) bpfeb - BPF (big endian) bpfel - BPF (little endian) [...]
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tc/tc_bpf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c index 677dd628..42c88418 100644 --- a/tc/tc_bpf.c +++ b/tc/tc_bpf.c @@ -39,6 +39,8 @@ #include #include +#include + #include "utils.h" #include "bpf_elf.h" @@ -1564,6 +1566,38 @@ static void bpf_hash_destroy(struct bpf_elf_ctx *ctx) } } +static int bpf_elf_check_ehdr(const struct bpf_elf_ctx *ctx) +{ + if (ctx->elf_hdr.e_type != ET_REL || + ctx->elf_hdr.e_machine != 0 || + ctx->elf_hdr.e_version != EV_CURRENT) { + fprintf(stderr, "ELF format error, ELF file not for eBPF?\n"); + return -EINVAL; + } + + switch (ctx->elf_hdr.e_ident[EI_DATA]) { + default: + fprintf(stderr, "ELF format error, wrong endianness info?\n"); + return -EINVAL; + case ELFDATA2LSB: + if (htons(1) == 1) { + fprintf(stderr, + "We are big endian, eBPF object is little endian!\n"); + return -EIO; + } + break; + case ELFDATA2MSB: + if (htons(1) != 1) { + fprintf(stderr, + "We are little endian, eBPF object is big endian!\n"); + return -EIO; + } + break; + } + + return 0; +} + static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, enum bpf_prog_type type, bool verbose) { @@ -1587,12 +1621,21 @@ static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname, goto out_fd; } + if (elf_kind(ctx->elf_fd) != ELF_K_ELF) { + ret = -EINVAL; + goto out_fd; + } + if (gelf_getehdr(ctx->elf_fd, &ctx->elf_hdr) != &ctx->elf_hdr) { ret = -EIO; goto out_elf; } + ret = bpf_elf_check_ehdr(ctx); + if (ret < 0) + goto out_elf; + ctx->sec_done = calloc(ctx->elf_hdr.e_shnum, sizeof(*(ctx->sec_done))); if (!ctx->sec_done) { From 5cd64c979f97ac1590e7bf28ae9b1adbd7673d3a Mon Sep 17 00:00:00 2001 From: Thomas Faivre Date: Thu, 14 Jan 2016 18:10:19 +0100 Subject: [PATCH 148/151] vxlan: fix help and man text Options 'group' and 'remote' cannot take 'any' as value but 'local' can. 
Signed-off-by: Thomas Faivre Signed-off-by: Nicolas Dichtel --- ip/iplink_vxlan.c | 2 +- man/man8/ip-link.8.in | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index aa4d5198..ede84824 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -23,7 +23,7 @@ static void print_explain(FILE *f) { - fprintf(f, "Usage: ... vxlan id VNI [ { group | remote } ADDR ] [ local ADDR ]\n"); + fprintf(f, "Usage: ... vxlan id VNI [ { group | remote } IP_ADDRESS ] [ local ADDR ]\n"); fprintf(f, " [ ttl TTL ] [ tos TOS ] [ dev PHYS_DEV ]\n"); fprintf(f, " [ dstport PORT ] [ srcport MIN MAX ]\n"); fprintf(f, " [ [no]learning ] [ [no]proxy ] [ [no]rsc ]\n"); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 189a8f15..36404964 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -391,7 +391,8 @@ the following additional arguments are supported: .RB " ] [ { " group " | " remote " } " .I IPADDR .R " ] [ " -.BI local " IPADDR " +.B local +.RI "{ "IPADDR " | "any " } " .R " ] [ " .BI ttl " TTL " .R " ] [ " From 1ab0f02f465b7bcc09798645d19236a500742e90 Mon Sep 17 00:00:00 2001 From: Thomas Faivre Date: Thu, 14 Jan 2016 18:10:20 +0100 Subject: [PATCH 149/151] ip-link: fix man page warnings groff wrapper returns warnings when parsing the ip-link.8.in file. How to reproduce: $ man --warnings ip-link > /dev/null `R' is a string (producing the registered sign), not a macro. [...]
Signed-off-by: Thomas Faivre Signed-off-by: Nicolas Dichtel --- man/man8/ip-link.8.in | 102 +++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 36404964..4d323435 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -284,28 +284,28 @@ the following additional arguments are supported: .BI link " DEVICE " .BI name " NAME " .BI type " vlan " -.R " [ " +[ .BI protocol " VLAN_PROTO " -.R " ] " +] .BI id " VLANID " -.R " [ " +[ .BR reorder_hdr " { " on " | " off " } " -.R " ] " -.R " [ " +] +[ .BR gvrp " { " on " | " off " } " -.R " ] " -.R " [ " +] +[ .BR mvrp " { " on " | " off " } " -.R " ] " -.R " [ " +] +[ .BR loose_binding " { " on " | " off " } " -.R " ] " -.R " [ " +] +[ .BI ingress-qos-map " QOS-MAP " -.R " ] " -.R " [ " +] +[ .BI egress-qos-map " QOS-MAP " -.R " ] " +] .in +8 .sp @@ -386,44 +386,44 @@ the following additional arguments are supported: .BI "ip link add " DEVICE .BI type " vxlan " id " ID" -.R " [ " +[ .BI dev " PHYS_DEV " .RB " ] [ { " group " | " remote " } " .I IPADDR -.R " ] [ " +] [ .B local .RI "{ "IPADDR " | "any " } " -.R " ] [ " +] [ .BI ttl " TTL " -.R " ] [ " +] [ .BI tos " TOS " -.R " ] [ " +] [ .BI dstport " PORT " -.R " ] [ " +] [ .BI srcport " MIN MAX " -.R " ] [ " +] [ .I "[no]learning " -.R " ] [ " +] [ .I "[no]proxy " -.R " ] [ " +] [ .I "[no]rsc " -.R " ] [ " +] [ .I "[no]l2miss " -.R " ] [ " +] [ .I "[no]l3miss " -.R " ] [ " +] [ .I "[no]udpcsum " -.R " ] [ " +] [ .I "[no]udp6zerocsumtx " -.R " ] [ " +] [ .I "[no]udp6zerocsumrx " -.R " ] [ " +] [ .BI ageing " SECONDS " -.R " ] [ " +] [ .BI maxaddress " NUMBER " -.R " ] [ " +] [ .B gbp -.R " ]" +] .in +8 .sp @@ -565,17 +565,17 @@ the following additional arguments are supported: .BI "ip link add " DEVICE .BR type " { gre | ipip | sit } " .BI " remote " ADDR " local " ADDR -.R " [ " +[ .BR encap " { fou | gue | none } " -.R " ] [ " +] [ .BI "encap-sport { " PORT 
" | auto } " -.R " ] [ " +] [ .BI "encap-dport " PORT -.R " ] [ " +] [ .I " [no]encap-csum " -.R " ] [ " +] [ .I " [no]encap-remcsum " -.R " ]" +] .in +8 .sp @@ -621,25 +621,25 @@ the following additional arguments are supported: .BI "ip link add " DEVICE .BI type " { ip6gre | ip6gretap } " remote " ADDR " local " ADDR -.R " [ " +[ .I "[i|o]seq]" -.R " ] [ " +] [ .I "[i|o]key" KEY -.R " ] [ " +] [ .I " [i|o]csum " -.R " ] [ " +] [ .BI hoplimit " TTL " -.R " ] [ " +] [ .BI encaplimit " ELIM " -.R " ] [ " +] [ .BI tclass " TCLASS " -.R " ] [ " +] [ .BI flowlabel " FLOWLABEL " -.R " ] [ " +] [ .BI "dscp inherit" -.R " ] [ " +] [ .BI dev " PHYS_DEV " -.R " ]" +] .in +8 .sp @@ -744,11 +744,11 @@ the following additional arguments are supported: .BI "ip link add " DEVICE .BI type " geneve " id " ID " remote " IPADDR" -.R " [ " +[ .BI ttl " TTL " -.R " ] [ " +] [ .BI tos " TOS " -.R " ]" +] .in +8 .sp From 57fdf2d4d94aeee493214d455b8f3336f09afa09 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 8 Jan 2016 17:32:36 +0900 Subject: [PATCH 150/151] libnetlink: don't print NETLINK_SOCK_DIAG errors in rtnl_talk This change is a no-op, as currently no code uses rtnl_talk on NETLINK_SOCK_DIAG_BY_FAMILY sockets. It is needed to suppress spurious errors when using SOCK_DESTROY via rtnl_talk. 
Signed-off-by: Lorenzo Colitti --- lib/libnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 16582144..d6b5fd3e 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -419,8 +419,10 @@ int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, return 0; } - fprintf(stderr, "RTNETLINK answers: %s\n", - strerror(-err->error)); + if (rtnl->proto != NETLINK_SOCK_DIAG) + fprintf(stderr, + "RTNETLINK answers: %s\n", + strerror(-err->error)); errno = -err->error; return -1; } From fb2594c183fbedbe8f91fe7b1f7fed1331bb3194 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 8 Jan 2016 17:32:37 +0900 Subject: [PATCH 151/151] ss: support closing inet sockets via SOCK_DESTROY. This patch adds a -K / --kill option to ss that attempts to forcibly close matching sockets using SOCK_DESTROY. Because ss typically prints sockets instead of acting on them, and because the kernel only supports forcibly closing some types of sockets, the output of -K is as follows: - If closing the socket succeeds, the socket is printed. - If the kernel does not support forcibly closing this type of socket (e.g., if it's a UDP socket, or a TIME_WAIT socket), the socket is silently skipped. - If an error occurs (e.g., permission denied), the error is reported and ss exits. Signed-off-by: Lorenzo Colitti --- man/man8/ss.8 | 5 +++++ misc/ss.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index f4d5264f..758460c2 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -48,6 +48,11 @@ Show process using socket. .B \-i, \-\-info Show internal TCP information. .TP +.B \-K, \-\-kill +Attempts to forcibly close sockets. This option displays sockets that are +successfully closed and silently skips sockets that the kernel does not support +closing. It supports IPv4 and IPv6 sockets only. +.TP .B \-s, \-\-summary Print summary statistics. 
This option does not parse socket lists obtaining summary from various sources. It is useful when amount of sockets is so huge diff --git a/misc/ss.c b/misc/ss.c index 0dab32ce..13fcc8f6 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -160,6 +160,7 @@ struct filter int states; int families; struct ssfilter *f; + bool kill; }; static const struct filter default_dbs[MAX_DB] = { @@ -2194,8 +2195,27 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f) struct inet_diag_arg { struct filter *f; int protocol; + struct rtnl_handle *rth; }; +static int kill_inet_sock(const struct sockaddr_nl *addr, + struct nlmsghdr *h, void *arg) +{ + struct inet_diag_msg *d = NLMSG_DATA(h); + struct inet_diag_arg *diag_arg = arg; + struct rtnl_handle *rth = diag_arg->rth; + DIAG_REQUEST(req, struct inet_diag_req_v2 r); + + req.nlh.nlmsg_type = SOCK_DESTROY; + req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nlh.nlmsg_seq = ++rth->seq; + req.r.sdiag_family = d->idiag_family; + req.r.sdiag_protocol = diag_arg->protocol; + req.r.id = d->id; + + return rtnl_talk(rth, &req.nlh, NULL, 0); +} + static int show_one_inet_sock(const struct sockaddr_nl *addr, struct nlmsghdr *h, void *arg) { @@ -2205,6 +2225,15 @@ static int show_one_inet_sock(const struct sockaddr_nl *addr, if (!(diag_arg->f->families & (1 << r->idiag_family))) return 0; + if (diag_arg->f->kill && kill_inet_sock(addr, h, arg) != 0) { + if (errno == EOPNOTSUPP || errno == ENOENT) { + /* Socket can't be closed, or is already closed. 
*/ + return 0; + } else { + perror("SOCK_DESTROY answers"); + return -1; + } + } if ((err = inet_show_sock(h, diag_arg->f, diag_arg->protocol)) < 0) return err; @@ -2214,12 +2243,21 @@ static int show_one_inet_sock(const struct sockaddr_nl *addr, static int inet_show_netlink(struct filter *f, FILE *dump_fp, int protocol) { int err = 0; - struct rtnl_handle rth; + struct rtnl_handle rth, rth2; int family = PF_INET; struct inet_diag_arg arg = { .f = f, .protocol = protocol }; if (rtnl_open_byproto(&rth, 0, NETLINK_SOCK_DIAG)) return -1; + + if (f->kill) { + if (rtnl_open_byproto(&rth2, 0, NETLINK_SOCK_DIAG)) { + rtnl_close(&rth); + return -1; + } + arg.rth = &rth2; + } + rth.dump = MAGIC_SEQ; rth.dump_fp = dump_fp; if (preferred_family == PF_INET6) @@ -2243,6 +2281,8 @@ again: Exit: rtnl_close(&rth); + if (arg.rth) + rtnl_close(arg.rth); return err; } @@ -3489,6 +3529,8 @@ static void _usage(FILE *dest) " -x, --unix display only Unix domain sockets\n" " -f, --family=FAMILY display sockets of type FAMILY\n" "\n" +" -K, --kill forcibly close sockets, display what was closed\n" +"\n" " -A, --query=QUERY, --socket=QUERY\n" " QUERY := {all|inet|tcp|udp|raw|unix|unix_dgram|unix_stream|unix_seqpacket|packet|netlink}[,QUERY]\n" "\n" @@ -3579,6 +3621,7 @@ static const struct option long_opts[] = { { "context", 0, 0, 'Z' }, { "contexts", 0, 0, 'z' }, { "net", 1, 0, 'N' }, + { "kill", 0, 0, 'K' }, { 0 } }; @@ -3593,7 +3636,7 @@ int main(int argc, char *argv[]) int ch; int state_filter = 0; - while ((ch = getopt_long(argc, argv, "dhaletuwxnro460spbEf:miA:D:F:vVzZN:", + while ((ch = getopt_long(argc, argv, "dhaletuwxnro460spbEf:miA:D:F:vVzZN:K", long_opts, NULL)) != EOF) { switch(ch) { case 'n': @@ -3774,6 +3817,9 @@ int main(int argc, char *argv[]) if (netns_switch(optarg)) exit(1); break; + case 'K': + current_filter.kill = 1; + break; case 'h': help(); case '?':