From b5a754b1db4e1a13d12f12bebbccc82c5dcd3f1d Mon Sep 17 00:00:00 2001 From: Tobias Jungel Date: Thu, 21 Mar 2019 09:32:39 +0100 Subject: [PATCH 01/10] ip: bridge: add mcast to unicast config flag This adds configuration for the IFLA_BRPORT_MCAST_TO_UCAST flag that allows multicast packets to be replicated as unicast packets. Signed-off-by: Tobias Jungel Signed-off-by: Stephen Hemminger --- bridge/link.c | 12 ++++++++++++ ip/iplink_bridge_slave.c | 9 +++++++++ man/man8/bridge.8 | 5 +++++ man/man8/ip-link.8.in | 5 +++++ 4 files changed, 31 insertions(+) diff --git a/bridge/link.c b/bridge/link.c index 32317e53..04cfc144 100644 --- a/bridge/link.c +++ b/bridge/link.c @@ -146,6 +146,9 @@ static void print_protinfo(FILE *fp, struct rtattr *attr) if (prtb[IFLA_BRPORT_MCAST_FLOOD]) print_onoff(fp, "mcast_flood", rta_getattr_u8(prtb[IFLA_BRPORT_MCAST_FLOOD])); + if (prtb[IFLA_BRPORT_MCAST_TO_UCAST]) + print_onoff(fp, "mcast_to_unicast", + rta_getattr_u8(prtb[IFLA_BRPORT_MCAST_TO_UCAST])); if (prtb[IFLA_BRPORT_NEIGH_SUPPRESS]) print_onoff(fp, "neigh_suppress", rta_getattr_u8(prtb[IFLA_BRPORT_NEIGH_SUPPRESS])); @@ -260,6 +263,7 @@ static void usage(void) fprintf(stderr, " [ learning_sync {on | off} ]\n"); fprintf(stderr, " [ flood {on | off} ]\n"); fprintf(stderr, " [ mcast_flood {on | off} ]\n"); + fprintf(stderr, " [ mcast_to_unicast {on | off} ]\n"); fprintf(stderr, " [ neigh_suppress {on | off} ]\n"); fprintf(stderr, " [ vlan_tunnel {on | off} ]\n"); fprintf(stderr, " [ isolated {on | off} ]\n"); @@ -306,6 +310,7 @@ static int brlink_modify(int argc, char **argv) __s8 flood = -1; __s8 vlan_tunnel = -1; __s8 mcast_flood = -1; + __s8 mcast_to_unicast = -1; __s8 isolated = -1; __s8 hairpin = -1; __s8 bpdu_guard = -1; @@ -354,6 +359,10 @@ static int brlink_modify(int argc, char **argv) NEXT_ARG(); if (!on_off("mcast_flood", &mcast_flood, *argv)) return -1; + } else if (strcmp(*argv, "mcast_to_unicast") == 0) { + NEXT_ARG(); + if (!on_off("mcast_to_unicast", 
&mcast_to_unicast, *argv)) + return -1; } else if (strcmp(*argv, "cost") == 0) { NEXT_ARG(); cost = atoi(*argv); @@ -453,6 +462,9 @@ static int brlink_modify(int argc, char **argv) if (mcast_flood >= 0) addattr8(&req.n, sizeof(req), IFLA_BRPORT_MCAST_FLOOD, mcast_flood); + if (mcast_to_unicast >= 0) + addattr8(&req.n, sizeof(req), IFLA_BRPORT_MCAST_TO_UCAST, + mcast_to_unicast); if (learning >= 0) addattr8(&req.n, sizeof(req), IFLA_BRPORT_LEARNING, learning); if (learning_sync >= 0) diff --git a/ip/iplink_bridge_slave.c b/ip/iplink_bridge_slave.c index 85e6b424..ae9d15fc 100644 --- a/ip/iplink_bridge_slave.c +++ b/ip/iplink_bridge_slave.c @@ -37,6 +37,7 @@ static void print_explain(FILE *f) " [ mcast_router MULTICAST_ROUTER ]\n" " [ mcast_fast_leave {on | off} ]\n" " [ mcast_flood {on | off} ]\n" + " [ mcast_to_unicast {on | off} ]\n" " [ group_fwd_mask MASK ]\n" " [ neigh_suppress {on | off} ]\n" " [ vlan_tunnel {on | off} ]\n" @@ -257,6 +258,10 @@ static void bridge_slave_print_opt(struct link_util *lu, FILE *f, _print_onoff(f, "mcast_flood", "mcast_flood", rta_getattr_u8(tb[IFLA_BRPORT_MCAST_FLOOD])); + if (tb[IFLA_BRPORT_MCAST_TO_UCAST]) + _print_onoff(f, "mcast_to_unicast", "mcast_to_unicast", + rta_getattr_u8(tb[IFLA_BRPORT_MCAST_TO_UCAST])); + if (tb[IFLA_BRPORT_NEIGH_SUPPRESS]) _print_onoff(f, "neigh_suppress", "neigh_suppress", rta_getattr_u8(tb[IFLA_BRPORT_NEIGH_SUPPRESS])); @@ -357,6 +362,10 @@ static int bridge_slave_parse_opt(struct link_util *lu, int argc, char **argv, NEXT_ARG(); bridge_slave_parse_on_off("mcast_flood", *argv, n, IFLA_BRPORT_MCAST_FLOOD); + } else if (matches(*argv, "mcast_to_unicast") == 0) { + NEXT_ARG(); + bridge_slave_parse_on_off("mcast_to_unicast", *argv, n, + IFLA_BRPORT_MCAST_TO_UCAST); } else if (matches(*argv, "proxy_arp") == 0) { NEXT_ARG(); bridge_slave_parse_on_off("proxy_arp", *argv, n, diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index c9af20e8..06e3fdb3 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ 
-47,6 +47,7 @@ bridge \- show / manipulate bridge addresses and devices .BR flood " { " on " | " off " } ] [ " .BR hwmode " { " vepa " | " veb " } ] [ " .BR mcast_flood " { " on " | " off " } ] [ " +.BR mcast_to_unicast " { " on " | " off " } ] [ " .BR neigh_suppress " { " on " | " off " } ] [ " .BR vlan_tunnel " { " on " | " off " } ] [ " .BR isolated " { " on " | " off " } ] [ " @@ -365,6 +366,10 @@ switch. .BR "mcast_flood on " or " mcast_flood off " Controls whether a given port will flood multicast traffic for which there is no MDB entry. By default this flag is on. +.TP +.BR "mcast_to_unicast on " or " mcast_to_unicast off " +Controls whether a given port will replicate packets using unicast instead of multicast. By default this flag is off. + .TP .BR "neigh_suppress on " or " neigh_suppress off " Controls whether neigh discovery (arp and nd) proxy and suppression is enabled on the port. By default this flag is off. diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 6f31453c..716421f4 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -2100,6 +2100,8 @@ the following additional arguments are supported: ] [ .BR mcast_flood " { " on " | " off " }" ] [ +.BR mcast_to_unicast " { " on " | " off " }" +] [ .BR group_fwd_mask " MASK" ] [ .BR neigh_suppress " { " on " | " off " }" @@ -2185,6 +2187,9 @@ option above. .BR mcast_flood " { " on " | " off " }" - controls whether a given port will flood multicast traffic for which there is no MDB entry. +.BR mcast_to_unicast " { " on " | " off " }" +- controls whether a given port will replicate packets using unicast instead of multicast. By default this flag is off. + .BI group_fwd_mask " MASK " - set the group forward mask. This is the bitmask that is applied to decide whether to forward incoming frames destined to link-local addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, ie the bridge does not forward any link-local frames coming on this port). 
From f76ad635f21dba1d1430bcdbf830097c2ecde237 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 22 Mar 2019 09:46:40 -0700 Subject: [PATCH 02/10] man: break long lines in man page sources No impact for output, just easier to edit. Signed-off-by: Stephen Hemminger --- man/man8/bridge.8 | 26 +++--- man/man8/ip-link.8.in | 180 +++++++++++++++++++++++++++--------------- man/man8/ip.8 | 25 +++--- man/man8/ss.8 | 62 +++++++++------ 4 files changed, 190 insertions(+), 103 deletions(-) diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index 06e3fdb3..bb4fb521 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -364,28 +364,34 @@ switch. .TP .BR "mcast_flood on " or " mcast_flood off " -Controls whether a given port will flood multicast traffic for which there is no MDB entry. By default this flag is on. +Controls whether a given port will flood multicast traffic for which +there is no MDB entry. By default this flag is on. .TP .BR "mcast_to_unicast on " or " mcast_to_unicast off " -Controls whether a given port will replicate packets using unicast instead of multicast. By default this flag is off. +Controls whether a given port will replicate packets using unicast +instead of multicast. By default this flag is off. .TP .BR "neigh_suppress on " or " neigh_suppress off " -Controls whether neigh discovery (arp and nd) proxy and suppression is enabled on the port. By default this flag is off. +Controls whether neigh discovery (arp and nd) proxy and suppression is +enabled on the port. By default this flag is off. .TP .BR "vlan_tunnel on " or " vlan_tunnel off " -Controls whether vlan to tunnel mapping is enabled on the port. By default this flag is off. +Controls whether vlan to tunnel mapping is enabled on the port. By +default this flag is off. .TP .BR "isolated on " or " isolated off " -Controls whether a given port will be isolated, which means it will be able to communicate with non-isolated ports only. -By default this flag is off. 
+Controls whether a given port will be isolated, which means it will be +able to communicate with non-isolated ports only. By default this +flag is off. .TP .BI backup_port " DEVICE" -If the port loses carrier all traffic will be redirected to the configured backup port +If the port loses carrier all traffic will be redirected to the +configured backup port .TP .BR nobackup_port @@ -639,9 +645,9 @@ the VLAN ID that identifies the vlan. .TP .BI tunnel_info " TUNNEL_ID" -the TUNNEL ID that maps to this vlan. The tunnel id is set in dst_metadata for -every packet that belongs to this vlan (applicable to bridge ports with vlan_tunnel -flag set). +the TUNNEL ID that maps to this vlan. The tunnel id is set in +dst_metadata for every packet that belongs to this vlan (applicable to +bridge ports with vlan_tunnel flag set). .TP .BI pvid diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 716421f4..988314e1 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -362,15 +362,18 @@ specifies the number of receive queues for new device. .TP .BI gso_max_size " BYTES " -specifies the recommended maximum size of a Generic Segment Offload packet the new device should accept. +specifies the recommended maximum size of a Generic Segment Offload +packet the new device should accept. .TP .BI gso_max_segs " SEGMENTS " -specifies the recommended maximum number of a Generic Segment Offload segments the new device should accept. +specifies the recommended maximum number of a Generic Segment Offload +segments the new device should accept. .TP .BI index " IDX " -specifies the desired index of the new virtual device. The link creation fails, if the index is busy. +specifies the desired index of the new virtual device. The link +creation fails, if the index is busy. 
.TP VLAN Type Support @@ -420,13 +423,14 @@ the following additional arguments are supported: .in +4 If .BR reorder_hdr " is " on -then VLAN header will be not inserted immediately but only before passing to the -physical device (if this device does not support VLAN offloading), the similar -on the RX direction - by default the packet will be untagged before being -received by VLAN device. Reordering allows to accelerate tagging on egress and -to hide VLAN header on ingress so the packet looks like regular Ethernet packet, -at the same time it might be confusing for packet capture as the VLAN header -does not exist within the packet. +then VLAN header will be not inserted immediately but only before +passing to the physical device (if this device does not support VLAN +offloading), the similar on the RX direction - by default the packet +will be untagged before being received by VLAN device. Reordering +allows to accelerate tagging on egress and to hide VLAN header on +ingress so the packet looks like regular Ethernet packet, at the same +time it might be confusing for packet capture as the VLAN header does +not exist within the packet. VLAN offloading can be checked by .BR ethtool "(8):" @@ -441,10 +445,12 @@ where is the physical device to which VLAN device is bound. .in -4 .BR gvrp " { " on " | " off " } " -- specifies whether this VLAN should be registered using GARP VLAN Registration Protocol. +- specifies whether this VLAN should be registered using GARP VLAN + Registration Protocol. .BR mvrp " { " on " | " off " } " -- specifies whether this VLAN should be registered using Multiple VLAN Registration Protocol. +- specifies whether this VLAN should be registered using Multiple VLAN + Registration Protocol. .BR loose_binding " { " on " | " off " } " - specifies whether the VLAN device state is bound to the physical device state. @@ -467,7 +473,8 @@ Linux packet priority can be set by -t mangle -A POSTROUTING [...] 
-j CLASSIFY --set-class 0:4 .sp .in -4 -and this "4" priority can be used in the egress qos mapping to set VLAN prio "5": +and this "4" priority can be used in the egress qos mapping to set +VLAN prio "5": .sp .in +4 .B ip @@ -585,7 +592,8 @@ bit is not set. .sp .BI dstport " PORT" -- specifies the UDP destination port to communicate to the remote VXLAN tunnel endpoint. +- specifies the UDP destination port to communicate to the remote + VXLAN tunnel endpoint. .sp .BI srcport " MIN MAX" @@ -1424,22 +1432,39 @@ the following additional arguments are supported: .in +8 .sp .BI ageing_time " AGEING_TIME " -- configure the bridge's FDB entries ageing time, ie the number of seconds a MAC address will be kept in the FDB after a packet has been received from that address. after this time has passed, entries are cleaned up. +- configure the bridge's FDB entries ageing time, ie the number of +seconds a MAC address will be kept in the FDB after a packet has been +received from that address. after this time has passed, entries are +cleaned up. .BI group_fwd_mask " MASK " -- set the group forward mask. This is the bitmask that is applied to decide whether to forward incoming frames destined to link-local addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, ie the bridge does not forward any link-local frames). +- set the group forward mask. This is the bitmask that is applied to +decide whether to forward incoming frames destined to link-local +addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, +ie the bridge does not forward any link-local frames). .BI group_address " ADDRESS " -- set the MAC address of the multicast group this bridge uses for STP. The address must be a link-local address in standard Ethernet MAC address format, ie an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f]. +- set the MAC address of the multicast group this bridge uses for STP. 
+The address must be a link-local address in standard Ethernet MAC +address format, ie an address of the form 01:80:C2:00:00:0X, with X + in [0, 4..f]. .BI forward_delay " FORWARD_DELAY " -- set the forwarding delay in seconds, ie the time spent in LISTENING state (before moving to LEARNING) and in LEARNING state (before moving to FORWARDING). Only relevant if STP is enabled. Valid values are between 2 and 30. +- set the forwarding delay in seconds, ie the time spent in LISTENING +state (before moving to LEARNING) and in LEARNING state (before +moving to FORWARDING). Only relevant if STP is enabled. Valid values +are between 2 and 30. .BI hello_time " HELLO_TIME " -- set the time in seconds between hello packets sent by the bridge, when it is a root bridge or a designated bridges. Only relevant if STP is enabled. Valid values are between 1 and 10. +- set the time in seconds between hello packets sent by the bridge, +when it is a root bridge or a designated bridge. +Only relevant if STP is enabled. Valid values are between 1 and 10. .BI max_age " MAX_AGE " -- set the hello packet timeout, ie the time in seconds until another bridge in the spanning tree is assumed to be dead, after reception of its last hello message. Only relevant if STP is enabled. Valid values are between 6 and 40. +- set the hello packet timeout, ie the time in seconds until another +bridge in the spanning tree is assumed to be dead, after reception of +its last hello message. Only relevant if STP is enabled. Valid values +are between 6 and 40. .BI stp_state " STP_STATE " - turn spanning tree protocol on @@ -1449,7 +1474,8 @@ or off for this bridge. .BI priority " PRIORITY " -- set this bridge's spanning tree priority, used during STP root bridge election. +- set this bridge's spanning tree priority, used during STP root +bridge election. .I PRIORITY is a 16bit unsigned integer. @@ -1509,21 +1535,28 @@ or disable IGMP querier, ie sending of multicast queries by the bridge (default: disabled). 
.BI mcast_querier_interval " QUERIER_INTERVAL " -- interval between queries sent by other routers. if no queries are seen after this delay has passed, the bridge will start to send its own queries (as if +- interval between queries sent by other routers. if no queries are seen +after this delay has passed, the bridge will start to send its own queries +(as if .BI mcast_querier was enabled). .BI mcast_hash_elasticity " HASH_ELASTICITY " -- set multicast database hash elasticity, ie the maximum chain length in the multicast hash table (defaults to 4). +- set multicast database hash elasticity, ie the maximum chain length +in the multicast hash table (defaults to 4). .BI mcast_hash_max " HASH_MAX " -- set maximum size of multicast hash table (defaults to 512, value must be a power of 2). +- set maximum size of multicast hash table (defaults to 512, +value must be a power of 2). .BI mcast_last_member_count " LAST_MEMBER_COUNT " -- set multicast last member count, ie the number of queries the bridge will send before stopping forwarding a multicast group after a "leave" message has been received (defaults to 2). +- set multicast last member count, ie the number of queries the bridge +will send before stopping forwarding a multicast group after a "leave" +message has been received (defaults to 2). .BI mcast_last_member_interval " LAST_MEMBER_INTERVAL " -- interval between queries to find remaining members of a group, after a "leave" message is received. +- interval between queries to find remaining members of a group, +after a "leave" message is received. .BI mcast_startup_query_count " STARTUP_QUERY_COUNT " - set the number of IGMP queries to send during startup phase (defaults to 2). @@ -1532,13 +1565,16 @@ was enabled). - interval between queries in the startup phase. .BI mcast_query_interval " QUERY_INTERVAL " -- interval between queries sent by the bridge after the end of the startup phase. 
+- interval between queries sent by the bridge after the end of the +startup phase. .BI mcast_query_response_interval " QUERY_RESPONSE_INTERVAL " -- set the Max Response Time/Maximum Response Delay for IGMP/MLD queries sent by the bridge. +- set the Max Response Time/Maximum Response Delay for IGMP/MLD +queries sent by the bridge. .BI mcast_membership_interval " MEMBERSHIP_INTERVAL " -- delay after which the bridge will leave a group, if no membership reports for this group are received. +- delay after which the bridge will leave a group, +if no membership reports for this group are received. .BI mcast_stats_enabled " MCAST_STATS_ENABLED " - enable @@ -1615,9 +1651,9 @@ the following additional arguments are supported: .sp .BI port " PORT " -- sets the port number component of secure channel for this MACsec device, in a -range from 1 to 65535 inclusive. Numbers with a leading " 0 " or " 0x " are -interpreted as octal and hexadecimal, respectively. +- sets the port number component of secure channel for this MACsec +device, in a range from 1 to 65535 inclusive. Numbers with a leading " +0 " or " 0x " are interpreted as octal and hexadecimal, respectively. .sp .BI sci " SCI " @@ -1639,7 +1675,8 @@ is a 64bit wide number in hexadecimal format. .sp .BR "send_sci on " or " send_sci off" -- specifies whether the SCI is included in every packet, or only when it is necessary. +- specifies whether the SCI is included in every packet, +or only when it is necessary. .sp .BR "end_station on " or " end_station off" @@ -1736,9 +1773,9 @@ call. .TP .BI dev " DEVICE " .I DEVICE -specifies network device to operate on. When configuring SR-IOV Virtual Function -(VF) devices, this keyword should specify the associated Physical Function (PF) -device. +specifies network device to operate on. When configuring SR-IOV +Virtual Function (VF) devices, this keyword should specify the +associated Physical Function (PF) device. 
.TP .BI group " GROUP " @@ -1770,13 +1807,16 @@ flag on the device. .BR "protodown on " or " protodown off" change the .B PROTODOWN -state on the device. Indicates that a protocol error has been detected on the port. Switch drivers can react to this error by doing a phys down on the switch port. +state on the device. Indicates that a protocol error has been detected +on the port. Switch drivers can react to this error by doing a phys +down on the switch port. .TP .BR "dynamic on " or " dynamic off" change the .B DYNAMIC -flag on the device. Indicates that address can change when interface goes down (currently +flag on the device. Indicates that address can change when interface +goes down (currently .B NOT used by the Linux). @@ -1821,8 +1861,8 @@ move the device to the network namespace associated with name Some devices are not allowed to change network namespace: loopback, bridge, ppp, wireless. These are network namespace local devices. In such case .B ip -tool will return "Invalid argument" error. It is possible to find out if device is local -to a single network namespace by checking +tool will return "Invalid argument" error. It is possible to find out +if device is local to a single network namespace by checking .B netns-local flag in the output of the .BR ethtool ":" @@ -1834,7 +1874,8 @@ flag in the output of the To change network namespace for wireless devices the .B iw -tool can be used. But it allows to change network namespace only for physical devices and by process +tool can be used. But it allows to change network namespace only for +physical devices and by process .IR PID . .TP @@ -1887,11 +1928,12 @@ as 0 disables VLAN tagging and filtering for the VF. .sp .BI proto " VLAN-PROTO" - assign VLAN PROTOCOL for the VLAN tag, either 802.1Q or 802.1ad. -Setting to 802.1ad, all traffic sent from the VF will be tagged with VLAN S-Tag. -Incoming traffic will have VLAN S-Tags stripped before being passed to the VF. 
-Setting to 802.1ad also enables an option to concatenate another VLAN tag, so both -S-TAG and C-TAG will be inserted/stripped for outgoing/incoming traffic, respectively. -If not specified, the value is assumed to be 802.1Q. Both the +Setting to 802.1ad, all traffic sent from the VF will be tagged with +VLAN S-Tag. Incoming traffic will have VLAN S-Tags stripped before +being passed to the VF. Setting to 802.1ad also enables an option to +concatenate another VLAN tag, so both S-TAG and C-TAG will be +inserted/stripped for outgoing/incoming traffic, respectively. If not +specified, the value is assumed to be 802.1Q. Both the .B vf and .B vlan @@ -1909,8 +1951,8 @@ option instead. .sp .BI max_tx_rate " TXRATE" -- change the allowed maximum transmit bandwidth, in Mbps, for the specified VF. -Setting this parameter to 0 disables rate limiting. +- change the allowed maximum transmit bandwidth, in Mbps, for the +specified VF. Setting this parameter to 0 disables rate limiting. .B vf parameter must be specified. @@ -1927,17 +1969,21 @@ parameter must be specified. - turn packet spoof checking on or off for the specified VF. .sp .BI query_rss " on|off" -- toggle the ability of querying the RSS configuration of a specific VF. VF RSS information like RSS hash key may be considered sensitive on some devices where this information is shared between VF and PF and thus its querying may be prohibited by default. +- toggle the ability of querying the RSS configuration of a specific + VF. VF RSS information like RSS hash key may be considered sensitive + on some devices where this information is shared between VF and PF + and thus its querying may be prohibited by default. .sp .BI state " auto|enable|disable" -- set the virtual link state as seen by the specified VF. Setting to auto means a -reflection of the PF link state, enable lets the VF to communicate with other VFs on -this host even if the PF link state is down, disable causes the HW to drop any packets -sent by the VF. 
+- set the virtual link state as seen by the specified VF. Setting to +auto means a reflection of the PF link state, enable lets the VF to +communicate with other VFs on this host even if the PF link state is +down, disable causes the HW to drop any packets sent by the VF. .sp .BI trust " on|off" -- trust the specified VF user. This enables that VF user can set a specific feature -which may impact security and/or performance. (e.g. VF multicast promiscuous mode) +- trust the specified VF user. This enables that VF user can set a +specific feature which may impact security and/or +performance. (e.g. VF multicast promiscuous mode) .sp .BI node_guid " eui64" - configure node GUID for Infiniband VFs. @@ -2045,7 +2091,8 @@ set the IPv6 address generation mode - disable automatic address generation .I stable_secret -- generate the interface identifier based on a preset /proc/sys/net/ipv6/conf/{default,DEVICE}/stable_secret +- generate the interface identifier based on a preset + /proc/sys/net/ipv6/conf/{default,DEVICE}/stable_secret .I random - like stable_secret, but auto-generate a new random secret if none is set @@ -2185,22 +2232,31 @@ queries. option above. .BR mcast_flood " { " on " | " off " }" -- controls whether a given port will flood multicast traffic for which there is no MDB entry. +- controls whether a given port will flood multicast traffic for which + there is no MDB entry. .BR mcast_to_unicast " { " on " | " off " }" -- controls whether a given port will replicate packets using unicast instead of multicast. By default this flag is off. +- controls whether a given port will replicate packets using unicast + instead of multicast. By default this flag is off. .BI group_fwd_mask " MASK " -- set the group forward mask. 
This is the bitmask that is applied to decide whether to forward incoming frames destined to link-local addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to 0, ie the bridge does not forward any link-local frames coming on this port). +- set the group forward mask. This is the bitmask that is applied to +decide whether to forward incoming frames destined to link-local +addresses, ie addresses of the form 01:80:C2:00:00:0X (defaults to +0, ie the bridge does not forward any link-local frames coming on +this port). .BR neigh_suppress " { " on " | " off " }" -- controls whether neigh discovery (arp and nd) proxy and suppression is enabled on the port. By default this flag is off. +- controls whether neigh discovery (arp and nd) proxy and suppression +is enabled on the port. By default this flag is off. .BR vlan_tunnel " { " on " | " off " }" -- controls whether vlan to tunnel mapping is enabled on the port. By default this flag is off. +- controls whether vlan to tunnel mapping is enabled on the port. By +default this flag is off. .BI backup_port " DEVICE" -- if the port loses carrier all traffic will be redirected to the configured backup port +- if the port loses carrier all traffic will be redirected to the +configured backup port .BR nobackup_port - removes the currently configured backup port diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 133d40d5..f4cbfc03 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -72,8 +72,9 @@ First failure will cause termination of ip. .TP .BR "\-force" -Don't terminate ip on errors in batch mode. -If there were any errors during execution of the commands, the application return code will be non zero. +Don't terminate ip on errors in batch mode. If there were any errors +during execution of the commands, the application return code will be +non zero. .TP .BR "\-s" , " \-stats" , " \-statistics" @@ -93,7 +94,8 @@ Zero (0) means loop until all addresses are removed. 
.TP .BR "\-f" , " \-family " -Specifies the protocol family to use. The protocol family identifier can be one of +Specifies the protocol family to use. The protocol family identifier +can be one of .BR "inet" , " inet6" , " bridge" , " mpls" or .BR link . @@ -174,7 +176,8 @@ to .TP .BR "\-a" , " \-all" -executes specified command over all objects, it depends if command supports this option. +executes specified command over all objects, it depends if command +supports this option. .TP .BR \-c [ color ][ = { always | auto | never } @@ -182,7 +185,8 @@ Configure color output. If parameter is omitted or .BR always , color output is enabled regardless of stdout state. If parameter is .BR auto , -stdout is checked to be a terminal before enabling color output. If parameter is +stdout is checked to be a terminal before enabling color output. If +parameter is .BR never , color output is disabled. If specified multiple times, the last one takes precedence. This flag is ignored if @@ -215,7 +219,8 @@ print human readable rates in IEC units (e.g. 1Ki = 1024). .TP .BR "\-br" , " \-brief" -Print only basic information in a tabular format for better readability. This option is currently only supported by +Print only basic information in a tabular format for better +readability. This option is currently only supported by .BR "ip addr show " and " ip link show " commands. .TP @@ -224,8 +229,9 @@ Output results in JavaScript Object Notation (JSON). .TP .BR "\-p", " \-pretty" -The default JSON format is compact and more efficient to parse but hard for most users to read. -This flag adds indentation for readability. +The default JSON format is compact and more efficient to parse but +hard for most users to read. This flag adds indentation for +readability. 
.SH IP - COMMAND SYNTAX @@ -339,7 +345,8 @@ or, if the objects of this class cannot be listed, .SH ENVIRONMENT .TP .B COLORFGBG -If set, it's value is used for detection whether background is dark or light and use contrast colors for it. +If set, its value is used for detection whether background is dark or +light and use contrast colors for it. COLORFGBG environment variable usually contains either two or three values separated by semicolons; we want the last value in either case. diff --git a/man/man8/ss.8 b/man/man8/ss.8 index b5099c2f..03a3dcc6 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -9,11 +9,11 @@ ss \- another utility to investigate sockets is used to dump socket statistics. It allows showing information similar to .IR netstat . -It can display more TCP and state informations than other tools. +It can display more TCP and state information than other tools. .SH OPTIONS -When no option is used ss displays a list of -open non-listening sockets (e.g. TCP/UNIX/UDP) that have established connection. +When no option is used ss displays a list of open non-listening +sockets (e.g. TCP/UNIX/UDP) that have established connection. .TP .B \-h, \-\-help Show summary of options. @@ -31,13 +31,14 @@ Do not try to resolve service names. Try to resolve numeric address/ports. .TP .B \-a, \-\-all -Display both listening and non-listening (for TCP this means established connections) sockets. +Display both listening and non-listening (for TCP this means +established connections) sockets. .TP .B \-l, \-\-listening Display only listening sockets (these are omitted by default). .TP .B \-o, \-\-options -Show timer information. For tcp protocol, the output format is: +Show timer information. 
For TCP protocol, the output format is: .RS .P timer:(,,) @@ -47,7 +48,9 @@ timer:(,,) the name of the timer, there are five kind of timer names: .RS .P -.BR on ": means one of these timers: tcp retrans timer, tcp early retrans timer and tail loss probe timer" +.B on +: means one of these timers: TCP retrans timer, TCP early retrans +timer and tail loss probe timer .P .BR keepalive ": tcp keep alive timer" .P @@ -63,7 +66,7 @@ how long time the timer will expire .P .TP .B -how many times the retran occurs +how many times the retransmission occurred .RE .TP .B \-e, \-\-extended @@ -121,19 +124,25 @@ the total memory can be allocated for sending packet .P .TP .B -the memory allocated by the socket as cache, but not used for receiving/sending packet yet. If need memory to send/receive packet, the memory in this cache will be used before allocate additional memory. +the memory allocated by the socket as cache, but not used for +receiving/sending packet yet. If need memory to send/receive packet, +the memory in this cache will be used before allocate additional +memory. .P .TP .B The memory allocated for sending packet (which has not been sent to layer 3) .P .TP -.B +.B The memory used for storing socket option, e.g., the key for TCP MD5 signature .P .TP .B -The memory used for the sk backlog queue. 
On a process context, if the +process is receiving packet, and a new packet is received, it will be +put into the sk backlog queue, so it can be received by the process +immediately .RE .TP .B \-p, \-\-processes @@ -169,7 +178,8 @@ the congestion algorithm name, the default congestion algorithm is "cubic" .P .TP .B wscale:: -if window scale option is used, this field shows the send scale factor and receive scale factor +if window scale option is used, this field shows the send scale factor +and receive scale factor .P .TP .B rto: @@ -177,11 +187,13 @@ tcp re-transmission timeout value, the unit is millisecond .P .TP .B backoff: -used for exponential backoff re-transmission, the actual re-transmission timeout value is icsk_rto << icsk_backoff +used for exponential backoff re-transmission, the actual +re-transmission timeout value is icsk_rto << icsk_backoff .P .TP .B rtt:/ -rtt is the average round trip time, rttvar is the mean deviation of rtt, their units are millisecond +rtt is the average round trip time, rttvar is the mean deviation of +rtt, their units are millisecond .P .TP .B ato: @@ -258,7 +270,8 @@ IPv6 Traffic Class byte .P .TP .B class_id -Class id set by net_cls cgroup. If class is zero this shows priority set by SO_PRIORITY. +Class id set by net_cls cgroup. If class is zero this shows priority +set by SO_PRIORITY. .RE .TP .B \-K, \-\-kill @@ -309,7 +322,8 @@ and is therefore a useful reference. Switch to the specified network namespace name. .TP .B \-b, \-\-bpf -Show socket BPF filters (only administrators are allowed to get these information). +Show socket BPF filters (only administrators are allowed to get these +information). .TP .B \-4, \-\-ipv4 Display only IP version 4 sockets (alias for -f inet). @@ -345,8 +359,8 @@ Display vsock sockets (alias for -f vsock). Display XDP sockets (alias for -f xdp). .TP .B \-f FAMILY, \-\-family=FAMILY -Display sockets of type FAMILY. 
-Currently the following families are supported: unix, inet, inet6, link, netlink, vsock, xdp. +Display sockets of type FAMILY. Currently the following families are +supported: unix, inet, inet6, link, netlink, vsock, xdp. .TP .B \-A QUERY, \-\-query=QUERY, \-\-socket=QUERY List of socket tables to dump, separated by commas. The following identifiers @@ -358,11 +372,12 @@ prefixed by an exclamation mark to exclude that socket table from being dumped. .TP .B \-D FILE, \-\-diag=FILE -Do not display anything, just dump raw information about TCP sockets to FILE after applying filters. If FILE is - stdout is used. +Do not display anything, just dump raw information about TCP sockets +to FILE after applying filters. If FILE is - stdout is used. .TP .B \-F FILE, \-\-filter=FILE -Read filter information from FILE. -Each line of FILE is interpreted like single command line option. If FILE is - stdin is used. +Read filter information from FILE. Each line of FILE is interpreted +like single command line option. If FILE is - stdin is used. .TP .B FILTER := [ state STATE-FILTER ] [ EXPRESSION ] Please take a look at the official documentation for details regarding filters. @@ -370,7 +385,9 @@ Please take a look at the official documentation for details regarding filters. .SH STATE-FILTER .B STATE-FILTER -allows to construct arbitrary set of states to match. Its syntax is sequence of keywords state and exclude followed by identifier of state. +allows to construct arbitrary set of states to match. Its syntax is +sequence of keywords state and exclude followed by identifier of +state. .TP Available identifiers are: @@ -417,7 +434,8 @@ Display all established ssh connections. Find all local processes connected to X server. .TP .B ss -o state fin-wait-1 '( sport = :http or sport = :https )' dst 193.233.7/24 -List all the tcp sockets in state FIN-WAIT-1 for our apache to network 193.233.7/24 and look at their timers. 
+List all the tcp sockets in state FIN-WAIT-1 for our apache to network +193.233.7/24 and look at their timers. .TP .B ss -a -A 'all,!tcp' List sockets in all states from all socket tables but TCP. From 6754e1d9783458550dce8d309efb4091ec8089a5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 27 Mar 2019 07:56:07 -0700 Subject: [PATCH 03/10] ip: fix typo in iplink_vlan usage message Need to use bar "|" rather than slash to indicate alternatives. Signed-off-by: Stephen Hemminger --- ip/iplink_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iplink_vlan.c b/ip/iplink_vlan.c index 74f4614a..08e49956 100644 --- a/ip/iplink_vlan.c +++ b/ip/iplink_vlan.c @@ -31,7 +31,7 @@ static void print_explain(FILE *f) " [ egress-qos-map QOS-MAP ]\n" "\n" "VLANID := 0-4095\n" - "VLANPROTO: [ 802.1Q / 802.1ad ]\n" + "VLANPROTO: [ 802.1Q | 802.1ad ]\n" "QOS-MAP := [ QOS-MAP ] QOS-MAPPING\n" "QOS-MAPPING := FROM:TO\n" ); From 492ec9558b304618389fd536f36e86062287f67f Mon Sep 17 00:00:00 2001 From: Leslie Monis Date: Wed, 27 Mar 2019 21:06:05 +0530 Subject: [PATCH 04/10] tc: pie: change maximum integer value of tc_pie_xstats->prob tc_pie_xstats->prob has a maximum value of (2^64 - 1). 
Signed-off-by: Leslie Monis Signed-off-by: Stephen Hemminger --- tc/q_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/q_pie.c b/tc/q_pie.c index f7924ef5..236ea31b 100644 --- a/tc/q_pie.c +++ b/tc/q_pie.c @@ -198,7 +198,7 @@ static int pie_print_xstats(struct qdisc_util *qu, FILE *f, st = RTA_DATA(xstats); /*prob is returned as a fracion of maximum integer value */ fprintf(f, "prob %f delay %uus avg_dq_rate %u\n", - (double)st->prob / (double)0xffffffff, st->delay, + (double)st->prob / UINT64_MAX, st->delay, st->avg_dq_rate); fprintf(f, "pkts_in %u overlimit %u dropped %u maxq %u ecn_mark %u\n", st->packets_in, st->overlimit, st->dropped, st->maxq, From 519ace17f9c4b1aa4a2e1adbba265cf8ad2a76eb Mon Sep 17 00:00:00 2001 From: Leslie Monis Date: Wed, 27 Mar 2019 21:06:06 +0530 Subject: [PATCH 05/10] tc: pie: update man page Update man page to reflect the changes made in Linux. Signed-off-by: Leslie Monis Signed-off-by: Stephen Hemminger --- man/man8/tc-pie.8 | 40 ++++++++++++++++++---------------------- man/man8/tc.8 | 1 + 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/man/man8/tc-pie.8 b/man/man8/tc-pie.8 index 278293bd..a302132f 100644 --- a/man/man8/tc-pie.8 +++ b/man/man8/tc-pie.8 @@ -60,10 +60,10 @@ limit on the queue size in packets. Incoming packets are dropped when this limit is reached. Default is 1000 packets. .SS target -is the expected queue delay. The default target delay is 20ms. +is the expected queue delay. The default target delay is 15ms. .SS tupdate -is the frequency at which the system drop probability is calculated. The default is 30ms. +is the frequency at which the system drop probability is calculated. The default is 15ms. .SS alpha .SS beta @@ -91,29 +91,27 @@ is turned off. 
.SH EXAMPLES # tc qdisc add dev eth0 root pie # tc -s qdisc show - qdisc pie 8034: dev eth0 root refcnt 2 limit 200p target 19000us tupdate 29000us alpha 2 beta 20 - Sent 7443524 bytes 7204 pkt (dropped 900, overlimits 0 requeues 0) - backlog 38998b 37p requeues 0 - prob 0.123384 delay 25000us avg_dq_rate 1464840 - pkts_in 7241 overlimit 900 dropped 0 maxq 186 ecn_mark 0 + qdisc pie 8036: dev eth0 root refcnt 2 limit 1000p target 15.0ms tupdate 16.0ms alpha 2 beta 20 + Sent 31216108 bytes 20800 pkt (dropped 80, overlimits 0 requeues 0) + backlog 16654b 11p requeues 0 + prob 0.006161 delay 15666us avg_dq_rate 1159667 + pkts_in 20811 overlimit 0 dropped 80 maxq 50 ecn_mark 0 # tc qdisc add dev eth0 root pie limit 100 target 20ms tupdate 30ms ecn # tc -s qdisc show - qdisc pie 8036: dev eth0 root refcnt 2 limit 200p target 19000 tupdate 29000 alpha 2 beta 20 ecn - Sent 2491922 bytes 2507 pkt (dropped 214, overlimits 0 requeues 0) - backlog 33728b 32p requeues 0 - prob 0.102262 delay 24000us avg_dq_rate 1464840 - pkts_in 2468 overlimit 214 dropped 0 maxq 192 ecn_mark 71 - + qdisc pie 8036: dev eth0 root refcnt 2 limit 100p target 20.0ms tupdate 32.0ms alpha 2 beta 20 ecn + Sent 6591724 bytes 4442 pkt (dropped 27, overlimits 0 requeues 0) + backlog 18168b 12p requeues 0 + prob 0.008845 delay 11348us avg_dq_rate 1342773 + pkts_in 4454 overlimit 0 dropped 27 maxq 65 ecn_mark 0 # tc qdisc add dev eth0 root pie limit 100 target 50ms tupdate 30ms bytemode # tc -s qdisc show - qdisc pie 8036: dev eth0 root refcnt 2 limit 200p target 19000 tupdate 29000 alpha 2 beta 20 ecn - Sent 2491922 bytes 2507 pkt (dropped 214, overlimits 0 requeues 0) - backlog 33728b 32p requeues 0 - prob 0.102262 delay 24000us avg_dq_rate 1464840 - pkts_in 2468 overlimit 214 dropped 0 maxq 192 ecn_mark 71 - + qdisc pie 8036: dev eth0 root refcnt 2 limit 100p target 50.0ms tupdate 32.0ms alpha 2 beta 20 bytemode + Sent 1616274 bytes 1137 pkt (dropped 0, overlimits 0 requeues 0) + backlog 13626b 9p 
requeues 0 + prob 0.000000 delay 0us avg_dq_rate 0 + pkts_in 1146 overlimit 0 dropped 0 maxq 23 ecn_mark 0 .SH SEE ALSO .BR tc (8), @@ -121,9 +119,7 @@ is turned off. .BR tc-red (8) .SH SOURCES - o IETF draft submission is at http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 - o IEEE Conference on High Performance Switching and Routing 2013 : "PIE: A -Lightweight Control Scheme to Address the Bufferbloat Problem" + o RFC 8033: https://tools.ietf.org/html/rfc8033 .SH AUTHORS PIE was implemented by Vijay Subramanian and Mythili Prabhu, also the authors of diff --git a/man/man8/tc.8 b/man/man8/tc.8 index f98398a3..ab0bad8a 100644 --- a/man/man8/tc.8 +++ b/man/man8/tc.8 @@ -848,6 +848,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR tc-mqprio (8), .BR tc-pfifo (8), .BR tc-pfifo_fast (8), +.BR tc-pie (8), .BR tc-red (8), .BR tc-route (8), .BR tc-sfb (8), From 41fc3fa04c3893a8669922cfda55de113b0b52c4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 5 Apr 2019 15:00:48 -0700 Subject: [PATCH 06/10] uapi: update bpf.h Updated bpf.h from 5.1-rc Signed-off-by: Stephen Hemminger --- include/uapi/linux/bpf.h | 188 ++++++++++++++++++++++++++------------- 1 file changed, 127 insertions(+), 61 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a857878f..882a97cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -502,16 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. 
- * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1435,14 +1425,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. * @@ -2098,6 +2088,25 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to @@ -2124,26 +2133,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_repeat(void *ctx) - * Description - * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. 
This delays - * the generation of a key up event for previously generated - * key down event. - * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. - * - * The *ctx* should point to the lirc sample as passed into - * the program. - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_BPF_LIRC_MODE2** configuration option set to - * "**y**". - * Return - * 0 - * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2159,30 +2149,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. 
* The type and the size of the local storage is defined @@ -2209,6 +2181,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2289,6 +2279,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2343,29 +2343,94 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. 
The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. 
+ * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. - * * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * int bpf_skb_ecn_set_ce(struct sk_buf *skb) - * Description - * Sets ECN of IP header to ce (congestion encountered) if - * current value is ect (ECN capable). Works with IPv6 and IPv4. - * Return - * 1 if set, 0 if not set. + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. 
+ * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2465,7 +2530,8 @@ union bpf_attr { FN(spin_unlock), \ FN(sk_fullsock), \ FN(tcp_sock), \ - FN(skb_ecn_set_ce), + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call From d5d27f27d8b4a2939990997c162f16f1d28fe677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 4 Apr 2019 15:09:10 +0200 Subject: [PATCH 07/10] q_cake: Add support for setting the fwmark option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for the newly added fwmark option to CAKE, which allows overriding the tin selection from the per-packet firewall marks. The fwmark field is a bitmask that is applied to the fwmark to select the tin. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Stephen Hemminger --- man/man8/tc-cake.8 | 16 ++++++++++++++++ tc/q_cake.c | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/man/man8/tc-cake.8 b/man/man8/tc-cake.8 index c62e5547..4112b755 100644 --- a/man/man8/tc-cake.8 +++ b/man/man8/tc-cake.8 @@ -91,6 +91,10 @@ TIME | LIMIT ] .br [ +.BR fwmark +MASK ] +.br +[ .BR ptm | .BR atm @@ -524,6 +528,18 @@ preset on the modern Internet is firmly discouraged. .br Voice (CS7, CS6, EF, VA, TOS4), 25% threshold, reduced Codel interval. +.PP +.B fwmark +MASK +.br + This option turns on fwmark-based overriding of CAKE's tin selection. +If set, the option specifies a bitmask that will be applied to the fwmark +associated with each packet. If the result of this masking is non-zero, the +result will be right-shifted by the number of least-significant unset bits in +the mask value, and the result will be used as the tin number for that packet. 
+This can be used to set policies in a firewall script that will override CAKE's +built-in tin selection. + .SH OTHER PARAMETERS .B memlimit LIMIT diff --git a/tc/q_cake.c b/tc/q_cake.c index e827e3f1..307a12c0 100644 --- a/tc/q_cake.c +++ b/tc/q_cake.c @@ -82,6 +82,7 @@ static void explain(void) " [ split-gso* | no-split-gso ]\n" " [ ack-filter | ack-filter-aggressive | no-ack-filter* ]\n" " [ memlimit LIMIT ]\n" +" [ fwmark MASK ]\n" " [ ptm | atm | noatm* ] [ overhead N | conservative | raw* ]\n" " [ mpu N ] [ ingress | egress* ]\n" " (* marks defaults)\n"); @@ -106,6 +107,7 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, char **argv, int autorate = -1; int ingress = -1; int overhead = 0; + int fwmark = -1; int wash = -1; int nat = -1; int atm = -1; @@ -332,6 +334,16 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, char **argv, "Illegal value for \"memlimit\": \"%s\"\n", *argv); return -1; } + } else if (strcmp(*argv, "fwmark") == 0) { + unsigned int fwm; + + NEXT_ARG(); + if (get_u32(&fwm, *argv, 0)) { + fprintf(stderr, + "Illegal value for \"fwmark\": \"%s\"\n", *argv); + return -1; + } + fwmark = fwm; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -376,6 +388,9 @@ static int cake_parse_opt(struct qdisc_util *qu, int argc, char **argv, if (memlimit) addattr_l(n, 1024, TCA_CAKE_MEMORY, &memlimit, sizeof(memlimit)); + if (fwmark != -1) + addattr_l(n, 1024, TCA_CAKE_FWMARK, &fwmark, + sizeof(fwmark)); if (nat != -1) addattr_l(n, 1024, TCA_CAKE_NAT, &nat, sizeof(nat)); if (wash != -1) @@ -409,6 +424,7 @@ static int cake_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) struct rtattr *tb[TCA_CAKE_MAX + 1]; unsigned int interval = 0; unsigned int memlimit = 0; + unsigned int fwmark = 0; __u64 bandwidth = 0; int ack_filter = 0; int split_gso = 0; @@ -507,6 +523,10 @@ static int cake_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) RTA_PAYLOAD(tb[TCA_CAKE_RTT]) >= sizeof(__u32)) { interval = 
rta_getattr_u32(tb[TCA_CAKE_RTT]); } + if (tb[TCA_CAKE_FWMARK] && + RTA_PAYLOAD(tb[TCA_CAKE_FWMARK]) >= sizeof(__u32)) { + fwmark = rta_getattr_u32(tb[TCA_CAKE_FWMARK]); + } if (wash) print_string(PRINT_FP, NULL, "wash ", NULL); @@ -559,6 +579,10 @@ static int cake_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) sprint_size(memlimit, b1)); } + if (fwmark) + print_uint(PRINT_FP, NULL, "fwmark 0x%x ", fwmark); + print_0xhex(PRINT_JSON, "fwmark", NULL, fwmark); + return 0; } From 286446c1e8c7f5f6eca4959015aa9e482b7adb11 Mon Sep 17 00:00:00 2001 From: Matt Ellison Date: Thu, 4 Apr 2019 10:08:45 -0400 Subject: [PATCH 08/10] ip: support for xfrm interfaces Interfaces take a 'if_id' which is an interface id which can be set on an xfrm policy as its interface lookup key (XFRMA_IF_ID). Signed-off-by: Matt Ellison Signed-off-by: Stephen Hemminger --- ip/Makefile | 2 +- ip/iplink.c | 3 +- ip/link_xfrm.c | 77 +++++++++++++++++++++++++ man/man8/ip-link.8.in | 27 ++++++++- testsuite/tests/ip/link/add_type_xfrm.t | 32 ++++++++++ 5 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 ip/link_xfrm.c create mode 100755 testsuite/tests/ip/link/add_type_xfrm.t diff --git a/ip/Makefile b/ip/Makefile index a88f9366..7ce6e91a 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -5,7 +5,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \ ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o iplink_dummy.o \ iplink_ifb.o iplink_nlmon.o iplink_team.o iplink_vcan.o iplink_vxcan.o \ iplink_vlan.o link_veth.o link_gre.o iplink_can.o iplink_xdp.o \ - iplink_macvlan.o ipl2tp.o link_vti.o link_vti6.o \ + iplink_macvlan.o ipl2tp.o link_vti.o link_vti6.o link_xfrm.o \ iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \ link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \ iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \ diff --git a/ip/iplink.c b/ip/iplink.c index 5a3c9613..7952cb2b 100644 --- 
a/ip/iplink.c +++ b/ip/iplink.c @@ -121,7 +121,8 @@ void iplink_usage(void) " bridge | bond | team | ipoib | ip6tnl | ipip | sit | vxlan |\n" " gre | gretap | erspan | ip6gre | ip6gretap | ip6erspan |\n" " vti | nlmon | team_slave | bond_slave | bridge_slave |\n" - " ipvlan | ipvtap | geneve | vrf | macsec | netdevsim | rmnet }\n"); + " ipvlan | ipvtap | geneve | vrf | macsec | netdevsim | rmnet |\n" + " xfrm }\n"); } exit(-1); } diff --git a/ip/link_xfrm.c b/ip/link_xfrm.c new file mode 100644 index 00000000..79a902fd --- /dev/null +++ b/ip/link_xfrm.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * link_xfrm.c Virtual XFRM Interface driver module + * + * Authors: Matt Ellison + */ + +#include +#include + +#include "rt_names.h" +#include "utils.h" +#include "ip_common.h" +#include "tunnel.h" + +static void xfrm_print_help(struct link_util *lu, int argc, char **argv, + FILE *f) +{ + fprintf(f, "Usage: ... %-4s dev PHYS_DEV [ if_id IF-ID ]\n", lu->id); + fprintf(f, "\nWhere: IF-ID := { 0x0..0xffffffff }\n"); +} + +static int xfrm_parse_opt(struct link_util *lu, int argc, char **argv, + struct nlmsghdr *n) +{ + unsigned int link = 0; + __u32 if_id = 0; + + while (argc > 0) { + if (!matches(*argv, "dev")) { + NEXT_ARG(); + link = ll_name_to_index(*argv); + if (!link) + exit(nodev(*argv)); + } else if (!matches(*argv, "if_id")) { + NEXT_ARG(); + if (!get_u32(&if_id, *argv, 0)) + addattr32(n, 1024, IFLA_XFRM_IF_ID, if_id); + } else { + xfrm_print_help(lu, argc, argv, stderr); + return -1; + } + argc--; argv++; + } + + if (link) { + addattr32(n, 1024, IFLA_XFRM_LINK, link); + } else { + fprintf(stderr, "must specify physical device\n"); + return -1; + } + + return 0; +} + +static void xfrm_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) +{ + + if (!tb) + return; + + if (tb[IFLA_XFRM_IF_ID]) { + __u32 id = rta_getattr_u32(tb[IFLA_XFRM_IF_ID]); + + print_0xhex(PRINT_ANY, "if_id", "if_id %#llx ", id); + + } + +} + +struct link_util xfrm_link_util 
= { + .id = "xfrm", + .maxattr = IFLA_XFRM_MAX, + .parse_opt = xfrm_parse_opt, + .print_opt = xfrm_print_opt, + .print_help = xfrm_print_help, +}; diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 988314e1..2411d43e 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -221,7 +221,8 @@ ip-link \- network device configuration .BR vrf " |" .BR macsec " |" .BR netdevsim " |" -.BR rmnet " ]" +.BR rmnet " |" +.BR xfrm " ]" .ti -8 .IR ETYPE " := [ " TYPE " |" @@ -350,6 +351,9 @@ Link types: .sp .BR rmnet - Qualcomm rmnet device +.sp +.BR xfrm +- Virtual xfrm interface .in -8 .TP @@ -1741,6 +1745,27 @@ the following additional arguments are supported: .in -8 +.TP +XFRM Type Support +For a link of type +.I XFRM +the following additional arguments are supported: + +.BI "ip link add " DEVICE " type xfrm dev " PHYS_DEV " [ if_id " IF_ID " ]" + +.in +8 +.sp +.BI dev " PHYS_DEV " +- specifies the underlying physical interface from which transform traffic is sent and received. + +.sp +.BI if_id " IF-ID " +- specifies the hexadecimal lookup key used to send traffic to and from specific xfrm +policies. Policies must be configured with the same key. If not set, the key defaults to +0 and will match any policies which similarly do not have a lookup key configuration. + +.in -8 + .SS ip link delete - delete virtual link .TP diff --git a/testsuite/tests/ip/link/add_type_xfrm.t b/testsuite/tests/ip/link/add_type_xfrm.t new file mode 100755 index 00000000..78ce28e0 --- /dev/null +++ b/testsuite/tests/ip/link/add_type_xfrm.t @@ -0,0 +1,32 @@ +#!/bin/sh + +. 
lib/generic.sh + +ts_log "[Testing Add XFRM Interface, With IF-ID]" + +PHYS_DEV="lo" +NEW_DEV="$(rand_dev)" +IF_ID="0xf" + +ts_ip "$0" "Add $NEW_DEV xfrm interface" link add dev $NEW_DEV type xfrm dev $PHYS_DEV if_id $IF_ID + +ts_ip "$0" "Show $NEW_DEV xfrm interface" -d link show dev $NEW_DEV +test_on "$NEW_DEV" +test_on "if_id $IF_ID" + +ts_ip "$0" "Del $NEW_DEV xfrm interface" link del dev $NEW_DEV + + +ts_log "[Testing Add XFRM Interface, No IF-ID]" + +PHYS_DEV="lo" +NEW_DEV="$(rand_dev)" +IF_ID="0xf" + +ts_ip "$0" "Add $NEW_DEV xfrm interface" link add dev $NEW_DEV type xfrm dev $PHYS_DEV + +ts_ip "$0" "Show $NEW_DEV xfrm interface" -d link show dev $NEW_DEV +test_on "$NEW_DEV" +test_on_not "if_id $IF_ID" + +ts_ip "$0" "Del $NEW_DEV xfrm interface" link del dev $NEW_DEV From 8391023680d0f0054a8894bb579220a7b9dffb0c Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 4 Apr 2019 12:21:54 +0200 Subject: [PATCH 09/10] ip: display netrom link type For a NETROM "ip link show dev nr0" will show 4: nr0: mtu 236 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/generic 88:98:6a:a4:84:40:0a brd 00:00:00:00:00:00:00 But rather link/netrom is expected to be displayed. Signed-off-by: Ralf Baechle Signed-off-by: Stephen Hemminger --- lib/ll_types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ll_types.c b/lib/ll_types.c index bd8893ed..32d04b5a 100644 --- a/lib/ll_types.c +++ b/lib/ll_types.c @@ -32,7 +32,7 @@ static const struct { int type; const char *name; } arphrd_names[] = { -{ 0, "generic" }, +__PF(NETROM,netrom) __PF(ETHER,ether) __PF(EETHER,eether) __PF(AX25,ax25) From aed63ae1acb9ab7acf9ef7dbd38ad465918d39ac Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 4 Apr 2019 19:07:38 +0300 Subject: [PATCH 10/10] ip xfrm: support setting/printing XFRMA_IF_ID attribute in states/policies The XFRMA_IF_ID attribute is set in policies/states for them to be associated with an XFRM interface (4.19+). 
Add support for setting / displaying this attribute. Note that 0 is a valid value therefore set XFRMA_IF_ID if any value was provided in command line. Tested-by: Antony Antony Signed-off-by: Eyal Birger Signed-off-by: Stephen Hemminger --- ip/ipxfrm.c | 8 ++++++++ ip/xfrm_policy.c | 12 +++++++++++- ip/xfrm_state.c | 11 +++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/ip/ipxfrm.c b/ip/ipxfrm.c index b153b863..32f56093 100644 --- a/ip/ipxfrm.c +++ b/ip/ipxfrm.c @@ -891,6 +891,14 @@ void xfrm_xfrma_print(struct rtattr *tb[], __u16 family, (xuo->flags & XFRM_OFFLOAD_INBOUND) ? "in" : "out"); fprintf(fp, "%s", _SL_); } + if (tb[XFRMA_IF_ID]) { + __u32 if_id = rta_getattr_u32(tb[XFRMA_IF_ID]); + + if (prefix) + fputs(prefix, fp); + fprintf(fp, "if_id %#x", if_id); + fprintf(fp, "%s", _SL_); + } } static int xfrm_selector_iszero(struct xfrm_selector *s) diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c index feccaada..4a63e9ab 100644 --- a/ip/xfrm_policy.c +++ b/ip/xfrm_policy.c @@ -55,7 +55,7 @@ static void usage(void) fprintf(stderr, "Usage: ip xfrm policy { add | update } SELECTOR dir DIR [ ctx CTX ]\n"); fprintf(stderr, " [ mark MARK [ mask MASK ] ] [ index INDEX ] [ ptype PTYPE ]\n"); fprintf(stderr, " [ action ACTION ] [ priority PRIORITY ] [ flag FLAG-LIST ]\n"); - fprintf(stderr, " [ LIMIT-LIST ] [ TMPL-LIST ]\n"); + fprintf(stderr, " [ if_id IF_ID ] [ LIMIT-LIST ] [ TMPL-LIST ]\n"); fprintf(stderr, "Usage: ip xfrm policy { delete | get } { SELECTOR | index INDEX } dir DIR\n"); fprintf(stderr, " [ ctx CTX ] [ mark MARK [ mask MASK ] ] [ ptype PTYPE ]\n"); fprintf(stderr, "Usage: ip xfrm policy { deleteall | list } [ nosock ] [ SELECTOR ] [ dir DIR ]\n"); @@ -270,6 +270,8 @@ static int xfrm_policy_modify(int cmd, unsigned int flags, int argc, char **argv struct xfrm_user_sec_ctx sctx; char str[CTX_BUF_SIZE]; } ctx = {}; + bool is_if_id_set = false; + __u32 if_id = 0; while (argc > 0) { if (strcmp(*argv, "dir") == 0) { @@ -338,6 +340,11 @@ 
static int xfrm_policy_modify(int cmd, unsigned int flags, int argc, char **argv xfrm_tmpl_parse(tmpl, &argc, &argv); tmpls_len += sizeof(*tmpl); + } else if (strcmp(*argv, "if_id") == 0) { + NEXT_ARG(); + if (get_u32(&if_id, *argv, 0)) + invarg("IF_ID value is invalid", *argv); + is_if_id_set = true; } else { if (selp) duparg("unknown", *argv); @@ -380,6 +387,9 @@ static int xfrm_policy_modify(int cmd, unsigned int flags, int argc, char **argv (void *)&ctx, ctx.sctx.len); } + if (is_if_id_set) + addattr32(&req.n, sizeof(req.buf), XFRMA_IF_ID, if_id); + if (rtnl_open_byproto(&rth, 0, NETLINK_XFRM) < 0) exit(1); diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c index 09292da9..93601437 100644 --- a/ip/xfrm_state.c +++ b/ip/xfrm_state.c @@ -62,6 +62,7 @@ static void usage(void) fprintf(stderr, " [ coa ADDR[/PLEN] ] [ ctx CTX ] [ extra-flag EXTRA-FLAG-LIST ]\n"); fprintf(stderr, " [ offload [dev DEV] dir DIR ]\n"); fprintf(stderr, " [ output-mark OUTPUT-MARK ]\n"); + fprintf(stderr, " [ if_id IF_ID ]\n"); fprintf(stderr, "Usage: ip xfrm state allocspi ID [ mode MODE ] [ mark MARK [ mask MASK ] ]\n"); fprintf(stderr, " [ reqid REQID ] [ seq SEQ ] [ min SPI max SPI ]\n"); fprintf(stderr, "Usage: ip xfrm state { delete | get } ID [ mark MARK [ mask MASK ] ]\n"); @@ -326,6 +327,8 @@ static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char **argv) char str[CTX_BUF_SIZE]; } ctx = {}; __u32 output_mark = 0; + bool is_if_id_set = false; + __u32 if_id = 0; while (argc > 0) { if (strcmp(*argv, "mode") == 0) { @@ -445,6 +448,11 @@ static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char **argv) NEXT_ARG(); if (get_u32(&output_mark, *argv, 0)) invarg("value after \"output-mark\" is invalid", *argv); + } else if (strcmp(*argv, "if_id") == 0) { + NEXT_ARG(); + if (get_u32(&if_id, *argv, 0)) + invarg("value after \"if_id\" is invalid", *argv); + is_if_id_set = true; } else { /* try to assume ALGO */ int type = xfrm_algotype_getbyname(*argv); @@ -627,6
+635,9 @@ static int xfrm_state_modify(int cmd, unsigned int flags, int argc, char **argv) } } + if (is_if_id_set) + addattr32(&req.n, sizeof(req.buf), XFRMA_IF_ID, if_id); + if (xfrm_xfrmproto_is_ipsec(req.xsinfo.id.proto)) { switch (req.xsinfo.mode) { case XFRM_MODE_TRANSPORT: