From 4aa0c9c9f8dbd40317ef34db11726d50e5fa8440 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 14 Apr 2020 09:57:52 +0300 Subject: [PATCH 01/33] devlink: Add devlink health auto_dump command support Add support for configuring auto_dump attribute per reporter. With this attribute, one can indicate whether the devlink kernel core should execute automatic dump on error. The change will be reflected in show, set and man commands. Signed-off-by: Eran Ben Elisha Reviewed-by: Aya Levin Signed-off-by: David Ahern --- devlink/devlink.c | 19 ++++++++++++++++++- man/man8/devlink-health.8 | 11 +++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index f67fe6dd..816b5de9 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -292,6 +292,7 @@ static void ifname_map_free(struct ifname_map *ifname_map) #define DL_OPT_TRAP_POLICER_ID BIT(34) #define DL_OPT_TRAP_POLICER_RATE BIT(35) #define DL_OPT_TRAP_POLICER_BURST BIT(36) +#define DL_OPT_HEALTH_REPORTER_AUTO_DUMP BIT(37) struct dl_opts { uint64_t present; /* flags of present items */ @@ -328,6 +329,7 @@ struct dl_opts { const char *reporter_name; uint64_t reporter_graceful_period; bool reporter_auto_recover; + bool reporter_auto_dump; const char *trap_name; const char *trap_group_name; enum devlink_trap_action trap_action; @@ -1474,6 +1476,13 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required, if (err) return err; o_found |= DL_OPT_HEALTH_REPORTER_AUTO_RECOVER; + } else if (dl_argv_match(dl, "auto_dump") && + (o_all & DL_OPT_HEALTH_REPORTER_AUTO_DUMP)) { + dl_arg_inc(dl); + err = dl_argv_bool(dl, &opts->reporter_auto_dump); + if (err) + return err; + o_found |= DL_OPT_HEALTH_REPORTER_AUTO_DUMP; } else if (dl_argv_match(dl, "trap") && (o_all & DL_OPT_TRAP_NAME)) { dl_arg_inc(dl); @@ -1656,6 +1665,9 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER) mnl_attr_put_u8(nlh, 
DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, opts->reporter_auto_recover); + if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_DUMP) + mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + opts->reporter_auto_dump); if (opts->present & DL_OPT_TRAP_NAME) mnl_attr_put_strz(nlh, DEVLINK_ATTR_TRAP_NAME, opts->trap_name); @@ -6505,7 +6517,8 @@ static int cmd_health_set_params(struct dl *dl) NLM_F_REQUEST | NLM_F_ACK); err = dl_argv_parse(dl, DL_OPT_HANDLE | DL_OPT_HEALTH_REPORTER_NAME, DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD | - DL_OPT_HEALTH_REPORTER_AUTO_RECOVER); + DL_OPT_HEALTH_REPORTER_AUTO_RECOVER | + DL_OPT_HEALTH_REPORTER_AUTO_DUMP); if (err) return err; @@ -6919,6 +6932,9 @@ static void pr_out_health(struct dl *dl, struct nlattr **tb_health) if (tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) print_bool(PRINT_ANY, "auto_recover", " auto_recover %s", mnl_attr_get_u8(tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])); + if (tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + print_bool(PRINT_ANY, "auto_dump", " auto_dump %s", + mnl_attr_get_u8(tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])); __pr_out_indent_dec(); pr_out_handle_end(dl); @@ -6975,6 +6991,7 @@ static void cmd_health_help(void) pr_err(" devlink health set DEV reporter REPORTER_NAME\n"); pr_err(" [ grace_period MSEC ]\n"); pr_err(" [ auto_recover { true | false } ]\n"); + pr_err(" [ auto_dump { true | false } ]\n"); } static int cmd_health(struct dl *dl) diff --git a/man/man8/devlink-health.8 b/man/man8/devlink-health.8 index 70a86cf0..8a3c77be 100644 --- a/man/man8/devlink-health.8 +++ b/man/man8/devlink-health.8 @@ -58,6 +58,9 @@ devlink-health \- devlink health reporting and recovery .RI "[ " .BR auto_recover " { " true " | " false " } " .RI "]" +.RI "[ " +.BR auto_dump " { " true " | " false " } " +.RI "]" .ti -8 .B devlink health help @@ -131,8 +134,8 @@ the next "devlink health dump show" command. - specifies the reporter's name registered on the devlink device. 
.SS devlink health set - Configure health reporter. -Please note that this command is not supported on a reporter which -doesn't support a recovery method. +Please note that some params are not supported on a reporter which +doesn't support a recovery or dump method. .PP .I "DEV" @@ -150,6 +153,10 @@ Time interval between consecutive auto recoveries. .BR auto_recover " { " true " | " false " } " Indicates whether the devlink should execute automatic recover on error. +.TP +.BR auto_dump " { " true " | " false " } " +Indicates whether the devlink should execute automatic dump on error. + .SH "EXAMPLES" .PP devlink health show From 998534c99eceafe580746caa5df1e9545fb59cde Mon Sep 17 00:00:00 2001 From: Mark Starovoytov Date: Fri, 24 Apr 2020 11:38:56 +0300 Subject: [PATCH 02/33] macsec: add support for MAC offload This patch enables MAC HW offload usage in iproute, since MACSec implementation supports it now. Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David Ahern --- ip/ipmacsec.c | 3 ++- man/man8/ip-macsec.8 | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ip/ipmacsec.c b/ip/ipmacsec.c index 4e500e4e..d214b101 100644 --- a/ip/ipmacsec.c +++ b/ip/ipmacsec.c @@ -34,6 +34,7 @@ static const char * const validate_str[] = { static const char * const offload_str[] = { [MACSEC_OFFLOAD_OFF] = "off", [MACSEC_OFFLOAD_PHY] = "phy", + [MACSEC_OFFLOAD_MAC] = "mac", }; struct sci { @@ -98,7 +99,7 @@ static void ipmacsec_usage(void) " ip macsec del DEV rx SCI sa { 0..3 }\n" " ip macsec show\n" " ip macsec show DEV\n" - " ip macsec offload DEV [ off | phy ]\n" + " ip macsec offload DEV [ off | phy | mac ]\n" "where OPTS := [ pn ] [ on | off ]\n" " ID := 128-bit hex string\n" " KEY := 128-bit or 256-bit hex string\n" diff --git a/man/man8/ip-macsec.8 b/man/man8/ip-macsec.8 index d5f9d240..b2ee7bee 100644 --- a/man/man8/ip-macsec.8 +++ b/man/man8/ip-macsec.8 @@ -54,7 +54,7 @@ ip-macsec \- MACsec device configuration .RI "{ " 0..3 " 
}" .BI "ip macsec offload " DEV -.RB "{ " off " | " phy " }" +.RB "{ " off " | " phy " | " mac " }" .B ip macsec show .RI [ " DEV " ] From bcbeb35ca4351928b4b8e4cb28802880312be0f1 Mon Sep 17 00:00:00 2001 From: Mark Starovoytov Date: Fri, 24 Apr 2020 11:38:57 +0300 Subject: [PATCH 03/33] macsec: add support for specifying offload at link add time This patch adds support for configuring offload mode upon MACsec device creation. If offload mode is not specified, then netlink attribute is not added. Default behavior on the kernel side in this case is backward-compatible (offloading is disabled by default). Example: $ ip link add link eth0 macsec0 type macsec port 11 encrypt on offload mac Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David Ahern --- ip/ipmacsec.c | 20 ++++++++++++++++++++ man/man8/ip-macsec.8 | 8 +++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/ip/ipmacsec.c b/ip/ipmacsec.c index d214b101..18289ecd 100644 --- a/ip/ipmacsec.c +++ b/ip/ipmacsec.c @@ -1220,6 +1220,15 @@ static void macsec_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) validate_to_str(val)); } + if (tb[IFLA_MACSEC_OFFLOAD]) { + __u8 val = rta_getattr_u8(tb[IFLA_MACSEC_OFFLOAD]); + + print_string(PRINT_ANY, + "offload", + "offload %s ", + offload_to_str(val)); + } + const char *inc_sci, *es, *replay; if (is_json_context()) { @@ -1268,6 +1277,7 @@ static void usage(FILE *f) " [ replay { on | off} window { 0..2^32-1 } ]\n" " [ validate { strict | check | disabled } ]\n" " [ encodingsa { 0..3 } ]\n" + " [ offload { mac | phy | off } ]\n" ); } @@ -1277,6 +1287,7 @@ static int macsec_parse_opt(struct link_util *lu, int argc, char **argv, int ret; __u8 encoding_sa = 0xff; __u32 window = -1; + enum macsec_offload offload; struct cipher_args cipher = {0}; enum macsec_validation_type validate; bool es = false, scb = false, send_sci = false; @@ -1398,6 +1409,15 @@ static int macsec_parse_opt(struct link_util *lu, int argc, char 
**argv, ret = get_an(&encoding_sa, *argv); if (ret) invarg("expected an { 0..3 }", *argv); + } else if (strcmp(*argv, "offload") == 0) { + NEXT_ARG(); + ret = one_of("offload", *argv, + offload_str, ARRAY_SIZE(offload_str), + (int *)&offload); + if (ret != 0) + return ret; + addattr8(n, MACSEC_BUFLEN, + IFLA_MACSEC_OFFLOAD, offload); } else { fprintf(stderr, "macsec: unknown command \"%s\"?\n", *argv); diff --git a/man/man8/ip-macsec.8 b/man/man8/ip-macsec.8 index b2ee7bee..8e9175c5 100644 --- a/man/man8/ip-macsec.8 +++ b/man/man8/ip-macsec.8 @@ -23,6 +23,8 @@ ip-macsec \- MACsec device configuration ] [ .BR validate " { " strict " | " check " | " disabled " } ] [" .BI encodingsa " SA" +] [ +.BR offload " { " off " | " phy " | " mac " }" ] .BI "ip macsec add " DEV " tx sa" @@ -86,7 +88,7 @@ type. .SH EXAMPLES .PP -.SS Create a MACsec device on link eth0 +.SS Create a MACsec device on link eth0 (offload is disabled by default) .nf # ip link add link eth0 macsec0 type macsec port 11 encrypt on .PP @@ -109,6 +111,10 @@ type. .SS Configure offloading on an interface .nf # ip macsec offload macsec0 phy +.PP +.SS Configure offloading upon MACsec device creation +.nf +# ip link add link eth0 macsec0 type macsec port 11 encrypt on offload mac .SH NOTES This tool can be used to configure the 802.1AE keys of the interface. 
Note that 802.1AE uses GCM-AES From 02ade5a8ea1c23201a99d8cdf7e02a6ba90d7718 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 29 Apr 2020 16:41:39 +0000 Subject: [PATCH 04/33] Update kernel headers and import mptcp.h Update kernel headers to commit 790ab249b55d ("net: ethernet: fec: Prevent MII event after MII_SPEED write") and import mptcp.h Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/if.h | 1 + include/uapi/linux/if_bridge.h | 42 ++++++++++++++++ include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/mptcp.h | 89 ++++++++++++++++++++++++++++++++++ 6 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/mptcp.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c7b2ffb2..60684b7b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1642,7 +1642,7 @@ union bpf_attr { * ifindex, but doesn't require a map to do so. * Return * **XDP_REDIRECT** on success, or the value of the two lower bits - * of the **flags* argument on error. + * of the *flags* argument on error. 
* * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 074b14e3..b287b2a0 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -176,6 +176,7 @@ enum { enum { IF_LINK_MODE_DEFAULT, IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ + IF_LINK_MODE_TESTING, /* limit upward transition to testing */ }; /* diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index cb581cc0..a4ac9f55 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -120,6 +120,7 @@ enum { IFLA_BRIDGE_MODE, IFLA_BRIDGE_VLAN_INFO, IFLA_BRIDGE_VLAN_TUNNEL_INFO, + IFLA_BRIDGE_MRP, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -157,6 +158,47 @@ struct bridge_vlan_xstats { __u32 pad2; }; +enum { + IFLA_BRIDGE_MRP_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE, + IFLA_BRIDGE_MRP_PORT_STATE, + IFLA_BRIDGE_MRP_PORT_ROLE, + IFLA_BRIDGE_MRP_RING_STATE, + IFLA_BRIDGE_MRP_RING_ROLE, + IFLA_BRIDGE_MRP_START_TEST, + __IFLA_BRIDGE_MRP_MAX, +}; + +struct br_mrp_instance { + __u32 ring_id; + __u32 p_ifindex; + __u32 s_ifindex; +}; + +struct br_mrp_port_role { + __u32 ring_id; + __u32 role; +}; + +struct br_mrp_ring_state { + __u32 ring_id; + __u32 ring_state; +}; + +struct br_mrp_ring_role { + __u32 ring_id; + __u32 ring_role; +}; + +struct br_mrp_start_test { + __u32 ring_id; + __u32 interval; + __u32 max_miss; + __u32 period; +}; + +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 728c42df..1a0c7dfe 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,6 +92,7 @@ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery 
Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 978f98c7..a8901a39 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -341,6 +341,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h new file mode 100644 index 00000000..009b8f0b --- /dev/null +++ b/include/uapi/linux/mptcp.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _MPTCP_H +#define _MPTCP_H + +#include +#include + +#define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0) +#define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1) +#define MPTCP_SUBFLOW_FLAG_JOIN_REM _BITUL(2) +#define MPTCP_SUBFLOW_FLAG_JOIN_LOC _BITUL(3) +#define MPTCP_SUBFLOW_FLAG_BKUP_REM _BITUL(4) +#define MPTCP_SUBFLOW_FLAG_BKUP_LOC _BITUL(5) +#define MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED _BITUL(6) +#define MPTCP_SUBFLOW_FLAG_CONNECTED _BITUL(7) +#define MPTCP_SUBFLOW_FLAG_MAPVALID _BITUL(8) + +enum { + MPTCP_SUBFLOW_ATTR_UNSPEC, + MPTCP_SUBFLOW_ATTR_TOKEN_REM, + MPTCP_SUBFLOW_ATTR_TOKEN_LOC, + MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + MPTCP_SUBFLOW_ATTR_SSN_OFFSET, + MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + MPTCP_SUBFLOW_ATTR_FLAGS, + MPTCP_SUBFLOW_ATTR_ID_REM, + MPTCP_SUBFLOW_ATTR_ID_LOC, + MPTCP_SUBFLOW_ATTR_PAD, + __MPTCP_SUBFLOW_ATTR_MAX +}; + +#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) + +/* netlink interface */ +#define MPTCP_PM_NAME "mptcp_pm" +#define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_VER 0x1 + +/* + * ATTR types defined for MPTCP + */ +enum { 
+ MPTCP_PM_ATTR_UNSPEC, + + MPTCP_PM_ATTR_ADDR, /* nested address */ + MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ + MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ + + __MPTCP_PM_ATTR_MAX +}; + +#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) + +enum { + MPTCP_PM_ADDR_ATTR_UNSPEC, + + MPTCP_PM_ADDR_ATTR_FAMILY, /* u16 */ + MPTCP_PM_ADDR_ATTR_ID, /* u8 */ + MPTCP_PM_ADDR_ATTR_ADDR4, /* struct in_addr */ + MPTCP_PM_ADDR_ATTR_ADDR6, /* struct in6_addr */ + MPTCP_PM_ADDR_ATTR_PORT, /* u16 */ + MPTCP_PM_ADDR_ATTR_FLAGS, /* u32 */ + MPTCP_PM_ADDR_ATTR_IF_IDX, /* s32 */ + + __MPTCP_PM_ADDR_ATTR_MAX +}; + +#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1) + +#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) +#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) + +enum { + MPTCP_PM_CMD_UNSPEC, + + MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_CMD_SET_LIMITS, + MPTCP_PM_CMD_GET_LIMITS, + + __MPTCP_PM_CMD_AFTER_LAST +}; + +#endif /* _MPTCP_H */ From 7e0767cd862bb5dd2d41c41c5e6f55d633f953ea Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Apr 2020 15:37:08 +0200 Subject: [PATCH 05/33] add support for mptcp netlink interface Implement basic commands to: - manipulate MPTCP endpoints list - manipulate MPTCP connection limits Examples: 1. Allows multiple subflows per MPTCP connection $ ip mptcp limits set subflows 2 2. Accept ADD_ADDR announcement from the peer (server): $ ip mptcp limits set add_addr_accepted 2 3. Add an ipv4 address to be announced for backup subflows: $ ip mptcp endpoint add 10.99.1.2 signal backup 4. 
Add an ipv6 address used as source for additional subflows: $ ip mptcp endpoint add 2001::2 subflow Signed-off-by: Paolo Abeni Signed-off-by: David Ahern --- ip/Makefile | 2 +- ip/ip.c | 3 +- ip/ip_common.h | 1 + ip/ipmptcp.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 440 insertions(+), 2 deletions(-) create mode 100644 ip/ipmptcp.c diff --git a/ip/Makefile b/ip/Makefile index 5ab78d7d..8735b8e4 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -11,7 +11,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \ iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \ iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \ ipvrf.o iplink_xstats.o ipseg6.o iplink_netdevsim.o iplink_rmnet.o \ - ipnexthop.o + ipnexthop.o ipmptcp.o RTMONOBJ=rtmon.o diff --git a/ip/ip.c b/ip/ip.c index 90392c2a..4249df03 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -51,7 +51,7 @@ static void usage(void) "where OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n" " tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n" " netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila |\n" - " vrf | sr | nexthop }\n" + " vrf | sr | nexthop | mptcp }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n" " -h[uman-readable] | -iec | -j[son] | -p[retty] |\n" " -f[amily] { inet | inet6 | mpls | bridge | link } |\n" @@ -103,6 +103,7 @@ static const struct cmd { { "vrf", do_ipvrf}, { "sr", do_seg6 }, { "nexthop", do_ipnh }, + { "mptcp", do_mptcp }, { "help", do_help }, { 0 } }; diff --git a/ip/ip_common.h b/ip/ip_common.h index 879287e3..d604f755 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -83,6 +83,7 @@ void vrf_reset(void); int netns_identify_pid(const char *pidstr, char *name, int len); int do_seg6(int argc, char **argv); int do_ipnh(int argc, char **argv); +int do_mptcp(int argc, char **argv); int iplink_get(char *name, __u32 filt_mask); int iplink_ifla_xstats(int argc, 
char **argv); diff --git a/ip/ipmptcp.c b/ip/ipmptcp.c new file mode 100644 index 00000000..bc12418b --- /dev/null +++ b/ip/ipmptcp.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include +#include + +#include "utils.h" +#include "ip_common.h" +#include "libgenl.h" +#include "json_print.h" + +static void usage(void) +{ + fprintf(stderr, + "Usage: ip mptcp endpoint add ADDRESS [ dev NAME ] [ id ID ]\n" + " [ FLAG-LIST ]\n" + " ip mptcp endpoint delete id ID\n" + " ip mptcp endpoint show [ id ID ]\n" + " ip mptcp endpoint flush\n" + " ip mptcp limits set [ subflows NR ] [ add_addr_accepted NR ]\n" + " ip mptcp limits show\n" + "FLAG-LIST := [ FLAG-LIST ] FLAG\n" + "FLAG := [ signal | subflow | backup ]\n"); + + exit(-1); +} + +/* netlink socket */ +static struct rtnl_handle genl_rth = { .fd = -1 }; +static int genl_family = -1; + +#define MPTCP_BUFLEN 4096 +#define MPTCP_REQUEST(_req, _cmd, _flags) \ + GENL_REQUEST(_req, MPTCP_BUFLEN, genl_family, 0, \ + MPTCP_PM_VER, _cmd, _flags) + +/* Mapping from argument to address flag mask */ +static const struct { + const char *name; + unsigned long value; +} mptcp_addr_flag_names[] = { + { "signal", MPTCP_PM_ADDR_FLAG_SIGNAL }, + { "subflow", MPTCP_PM_ADDR_FLAG_SUBFLOW }, + { "backup", MPTCP_PM_ADDR_FLAG_BACKUP }, +}; + +static void print_mptcp_addr_flags(unsigned int flags) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(mptcp_addr_flag_names); i++) { + unsigned long mask = mptcp_addr_flag_names[i].value; + + if (flags & mask) { + print_string(PRINT_FP, NULL, "%s ", + mptcp_addr_flag_names[i].name); + print_bool(PRINT_JSON, + mptcp_addr_flag_names[i].name, NULL, true); + } + + flags &= ~mask; + } + + if (flags) { + /* unknown flags */ + SPRINT_BUF(b1); + + snprintf(b1, sizeof(b1), "%02x", flags); + print_string(PRINT_ANY, "rawflags", "rawflags %s ", b1); + } +} + +static int get_flags(const char *arg, __u32 *flags) +{ + unsigned int i; + + for (i = 0; i < 
ARRAY_SIZE(mptcp_addr_flag_names); i++) { + if (strcmp(arg, mptcp_addr_flag_names[i].name)) + continue; + + *flags |= mptcp_addr_flag_names[i].value; + return 0; + } + return -1; +} + +static int mptcp_parse_opt(int argc, char **argv, struct nlmsghdr *n, + bool adding) +{ + struct rtattr *attr_addr; + bool addr_set = false; + inet_prefix address; + bool id_set = false; + __u32 index = 0; + __u32 flags = 0; + __u8 id = 0; + + ll_init_map(&rth); + while (argc > 0) { + if (get_flags(*argv, &flags) == 0) { + } else if (matches(*argv, "id") == 0) { + NEXT_ARG(); + + if (get_u8(&id, *argv, 0)) + invarg("invalid ID\n", *argv); + id_set = true; + } else if (matches(*argv, "dev") == 0) { + const char *ifname; + + NEXT_ARG(); + + ifname = *argv; + + if (check_ifname(ifname)) + invarg("invalid interface name\n", ifname); + + index = ll_name_to_index(ifname); + + if (!index) + invarg("device does not exist\n", ifname); + + } else if (get_addr(&address, *argv, AF_UNSPEC) == 0) { + addr_set = true; + } else { + invarg("unknown argument", *argv); + } + NEXT_ARG_FWD(); + } + + if (!addr_set && adding) + missarg("ADDRESS"); + + if (!id_set && !adding) + missarg("ID"); + + attr_addr = addattr_nest(n, MPTCP_BUFLEN, + MPTCP_PM_ATTR_ADDR | NLA_F_NESTED); + if (id_set) + addattr8(n, MPTCP_BUFLEN, MPTCP_PM_ADDR_ATTR_ID, id); + if (flags) + addattr32(n, MPTCP_BUFLEN, MPTCP_PM_ADDR_ATTR_FLAGS, flags); + if (index) + addattr32(n, MPTCP_BUFLEN, MPTCP_PM_ADDR_ATTR_IF_IDX, index); + if (addr_set) { + int type; + + addattr16(n, MPTCP_BUFLEN, MPTCP_PM_ADDR_ATTR_FAMILY, + address.family); + type = address.family == AF_INET ? 
MPTCP_PM_ADDR_ATTR_ADDR4 : + MPTCP_PM_ADDR_ATTR_ADDR6; + addattr_l(n, MPTCP_BUFLEN, type, &address.data, + address.bytelen); + } + + addattr_nest_end(n, attr_addr); + return 0; +} + +static int mptcp_addr_modify(int argc, char **argv, int cmd) +{ + MPTCP_REQUEST(req, cmd, NLM_F_REQUEST); + int ret; + + ret = mptcp_parse_opt(argc, argv, &req.n, cmd == MPTCP_PM_CMD_ADD_ADDR); + if (ret) + return ret; + + if (rtnl_talk(&genl_rth, &req.n, NULL) < 0) + return -2; + + return 0; +} + +static int print_mptcp_addrinfo(struct rtattr *addrinfo) +{ + struct rtattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + __u8 family = AF_UNSPEC, addr_attr_type; + const char *ifname; + unsigned int flags; + int index; + __u16 id; + + parse_rtattr_nested(tb, MPTCP_PM_ADDR_ATTR_MAX, addrinfo); + + open_json_object(NULL); + if (tb[MPTCP_PM_ADDR_ATTR_FAMILY]) + family = rta_getattr_u8(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + + addr_attr_type = family == AF_INET ? MPTCP_PM_ADDR_ATTR_ADDR4 : + MPTCP_PM_ADDR_ATTR_ADDR6; + if (tb[addr_attr_type]) { + print_string(PRINT_ANY, "address", "%s ", + format_host_rta(family, tb[addr_attr_type])); + } + if (tb[MPTCP_PM_ADDR_ATTR_ID]) { + id = rta_getattr_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + print_uint(PRINT_ANY, "id", "id %u ", id); + } + if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) { + flags = rta_getattr_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + print_mptcp_addr_flags(flags); + } + if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { + index = rta_getattr_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + ifname = index ? 
ll_index_to_name(index) : NULL; + + if (ifname) + print_string(PRINT_ANY, "dev", "dev %s ", ifname); + } + + close_json_object(); + print_string(PRINT_FP, NULL, "\n", NULL); + fflush(stdout); + + return 0; +} + +static int print_mptcp_addr(struct nlmsghdr *n, void *arg) +{ + struct rtattr *tb[MPTCP_PM_ATTR_MAX + 1]; + struct genlmsghdr *ghdr; + struct rtattr *addrinfo; + int len = n->nlmsg_len; + + if (n->nlmsg_type != genl_family) + return 0; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + if (len < 0) + return -1; + + ghdr = NLMSG_DATA(n); + parse_rtattr_flags(tb, MPTCP_PM_ATTR_MAX, (void *) ghdr + GENL_HDRLEN, + len, NLA_F_NESTED); + addrinfo = tb[MPTCP_PM_ATTR_ADDR]; + if (!addrinfo) + return -1; + + ll_init_map(&rth); + return print_mptcp_addrinfo(addrinfo); +} + +static int mptcp_addr_dump(void) +{ + MPTCP_REQUEST(req, MPTCP_PM_CMD_GET_ADDR, NLM_F_REQUEST | NLM_F_DUMP); + + if (rtnl_send(&genl_rth, &req.n, req.n.nlmsg_len) < 0) { + perror("Cannot send show request"); + exit(1); + } + + new_json_obj(json); + + if (rtnl_dump_filter(&genl_rth, print_mptcp_addr, stdout) < 0) { + fprintf(stderr, "Dump terminated\n"); + delete_json_obj(); + fflush(stdout); + return -2; + } + + close_json_object(); + fflush(stdout); + return 0; +} + +static int mptcp_addr_show(int argc, char **argv) +{ + MPTCP_REQUEST(req, MPTCP_PM_CMD_GET_ADDR, NLM_F_REQUEST); + struct nlmsghdr *answer; + int ret; + + if (!argv) + return mptcp_addr_dump(); + + ret = mptcp_parse_opt(argc, argv, &req.n, false); + if (ret) + return ret; + + if (rtnl_talk(&genl_rth, &req.n, &answer) < 0) + return -2; + + return print_mptcp_addr(answer, stdout); +} + +static int mptcp_addr_flush(int argc, char **argv) +{ + MPTCP_REQUEST(req, MPTCP_PM_CMD_FLUSH_ADDRS, NLM_F_REQUEST); + + if (rtnl_talk(&genl_rth, &req.n, NULL) < 0) + return -2; + + return 0; +} + +static int mptcp_parse_limit(int argc, char **argv, struct nlmsghdr *n) +{ + bool set_rcv_add_addrs = false; + bool set_subflows = false; + __u32 rcv_add_addrs = 0; + 
__u32 subflows = 0; + + while (argc > 0) { + if (matches(*argv, "subflows") == 0) { + NEXT_ARG(); + + if (get_u32(&subflows, *argv, 0)) + invarg("invalid subflows\n", *argv); + set_subflows = true; + } else if (matches(*argv, "add_addr_accepted") == 0) { + NEXT_ARG(); + + if (get_u32(&rcv_add_addrs, *argv, 0)) + invarg("invalid add_addr_accepted\n", *argv); + set_rcv_add_addrs = true; + } else { + invarg("unknown limit", *argv); + } + NEXT_ARG_FWD(); + } + + if (set_rcv_add_addrs) + addattr32(n, MPTCP_BUFLEN, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + rcv_add_addrs); + if (set_subflows) + addattr32(n, MPTCP_BUFLEN, MPTCP_PM_ATTR_SUBFLOWS, subflows); + return set_rcv_add_addrs || set_subflows; +} + +static int print_mptcp_limit(struct nlmsghdr *n, void *arg) +{ + struct rtattr *tb[MPTCP_PM_ATTR_MAX + 1]; + struct genlmsghdr *ghdr; + int len = n->nlmsg_len; + __u32 val; + + if (n->nlmsg_type != genl_family) + return 0; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + if (len < 0) + return -1; + + ghdr = NLMSG_DATA(n); + parse_rtattr(tb, MPTCP_PM_ATTR_MAX, (void *) ghdr + GENL_HDRLEN, len); + + open_json_object(NULL); + if (tb[MPTCP_PM_ATTR_RCV_ADD_ADDRS]) { + val = rta_getattr_u32(tb[MPTCP_PM_ATTR_RCV_ADD_ADDRS]); + + print_uint(PRINT_ANY, "add_addr_accepted", + "add_addr_accepted %d ", val); + } + + if (tb[MPTCP_PM_ATTR_SUBFLOWS]) { + val = rta_getattr_u32(tb[MPTCP_PM_ATTR_SUBFLOWS]); + + print_uint(PRINT_ANY, "subflows", "subflows %d ", val); + } + print_string(PRINT_FP, NULL, "%s", "\n"); + fflush(stdout); + close_json_object(); + return 0; +} + +static int mptcp_limit_get_set(int argc, char **argv, int cmd) +{ + bool do_get = cmd == MPTCP_PM_CMD_GET_LIMITS; + MPTCP_REQUEST(req, cmd, NLM_F_REQUEST); + struct nlmsghdr *answer; + int ret; + + ret = mptcp_parse_limit(argc, argv, &req.n); + if (ret < 0) + return -1; + + if (rtnl_talk(&genl_rth, &req.n, do_get ? 
&answer : NULL) < 0) + return -2; + + if (do_get) + return print_mptcp_limit(answer, stdout); + return 0; +} + +int do_mptcp(int argc, char **argv) +{ + if (argc == 0) + usage(); + + if (matches(*argv, "help") == 0) + usage(); + + if (genl_init_handle(&genl_rth, MPTCP_PM_NAME, &genl_family)) + exit(1); + + if (matches(*argv, "endpoint") == 0) { + NEXT_ARG_FWD(); + if (argc == 0) + return mptcp_addr_show(0, NULL); + + if (matches(*argv, "add") == 0) + return mptcp_addr_modify(argc-1, argv+1, + MPTCP_PM_CMD_ADD_ADDR); + if (matches(*argv, "delete") == 0) + return mptcp_addr_modify(argc-1, argv+1, + MPTCP_PM_CMD_DEL_ADDR); + if (matches(*argv, "show") == 0) + return mptcp_addr_show(argc-1, argv+1); + if (matches(*argv, "flush") == 0) + return mptcp_addr_flush(argc-1, argv+1); + + goto unknown; + } + + if (matches(*argv, "limits") == 0) { + NEXT_ARG_FWD(); + if (argc == 0) + return mptcp_limit_get_set(0, NULL, + MPTCP_PM_CMD_GET_LIMITS); + + if (matches(*argv, "set") == 0) + return mptcp_limit_get_set(argc-1, argv+1, + MPTCP_PM_CMD_SET_LIMITS); + if (matches(*argv, "show") == 0) + return mptcp_limit_get_set(argc-1, argv+1, + MPTCP_PM_CMD_GET_LIMITS); + } + +unknown: + fprintf(stderr, "Command \"%s\" is unknown, try \"ip mptcp help\".\n", + *argv); + exit(-1); +} From 712fdd98c0839540a50baca0fb858c7a72d18031 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 23 Apr 2020 15:37:09 +0200 Subject: [PATCH 06/33] ss: allow dumping MPTCP subflow information [root@f31 packetdrill]# ss -tni ESTAB 0 0 192.168.82.247:8080 192.0.2.1:35273 cubic wscale:7,8 [...] tcp-ulp-mptcp flags:Mec token:0000(id:0)/5f856c60(id:0) seq:b810457db34209a5 sfseq:1 ssnoff:0 maplen:190 Additionally extends ss manpage to describe the new entry layout. 
Signed-off-by: Davide Caratti Signed-off-by: Paolo Abeni Signed-off-by: David Ahern --- man/man8/ss.8 | 5 +++++ misc/ss.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index 023d771b..c80853f9 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -261,6 +261,11 @@ the pacing rate and max pacing rate .TP .B rcv_space: a helper variable for TCP internal auto tuning socket receive buffer +.P +.TP +.B tcp-ulp-mptcp flags:[MmBbJjecv] token: seq: sfseq: ssnoff: maplen: +MPTCP subflow information +.P .RE .TP .B \-\-tos diff --git a/misc/ss.c b/misc/ss.c index 3ef151fb..ee840149 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -53,6 +53,7 @@ #include #include #include +#include /* AF_VSOCK/PF_VSOCK is only provided since glibc 2.18 */ #ifndef PF_VSOCK @@ -2836,6 +2837,59 @@ static void tcp_tls_conf(const char *name, struct rtattr *attr) } } +static void mptcp_subflow_info(struct rtattr *tb[]) +{ + u_int32_t flags = 0; + + if (tb[MPTCP_SUBFLOW_ATTR_FLAGS]) { + char caps[32 + 1] = { 0 }, *cap = &caps[0]; + + flags = rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_FLAGS]); + + if (flags & MPTCP_SUBFLOW_FLAG_MCAP_REM) + *cap++ = 'M'; + if (flags & MPTCP_SUBFLOW_FLAG_MCAP_LOC) + *cap++ = 'm'; + if (flags & MPTCP_SUBFLOW_FLAG_JOIN_REM) + *cap++ = 'J'; + if (flags & MPTCP_SUBFLOW_FLAG_JOIN_LOC) + *cap++ = 'j'; + if (flags & MPTCP_SUBFLOW_FLAG_BKUP_REM) + *cap++ = 'B'; + if (flags & MPTCP_SUBFLOW_FLAG_BKUP_LOC) + *cap++ = 'b'; + if (flags & MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED) + *cap++ = 'e'; + if (flags & MPTCP_SUBFLOW_FLAG_CONNECTED) + *cap++ = 'c'; + if (flags & MPTCP_SUBFLOW_FLAG_MAPVALID) + *cap++ = 'v'; + if (flags) + out(" flags:%s", caps); + } + if (tb[MPTCP_SUBFLOW_ATTR_TOKEN_REM] && + tb[MPTCP_SUBFLOW_ATTR_TOKEN_LOC] && + tb[MPTCP_SUBFLOW_ATTR_ID_REM] && + tb[MPTCP_SUBFLOW_ATTR_ID_LOC]) + out(" token:%04x(id:%hhu)/%04x(id:%hhu)", + rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_TOKEN_REM]), + 
rta_getattr_u8(tb[MPTCP_SUBFLOW_ATTR_ID_REM]), + rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_TOKEN_LOC]), + rta_getattr_u8(tb[MPTCP_SUBFLOW_ATTR_ID_LOC])); + if (tb[MPTCP_SUBFLOW_ATTR_MAP_SEQ]) + out(" seq:%llx", + rta_getattr_u64(tb[MPTCP_SUBFLOW_ATTR_MAP_SEQ])); + if (tb[MPTCP_SUBFLOW_ATTR_MAP_SFSEQ]) + out(" sfseq:%x", + rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_MAP_SFSEQ])); + if (tb[MPTCP_SUBFLOW_ATTR_SSN_OFFSET]) + out(" ssnoff:%x", + rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_SSN_OFFSET])); + if (tb[MPTCP_SUBFLOW_ATTR_MAP_DATALEN]) + out(" maplen:%x", + rta_getattr_u32(tb[MPTCP_SUBFLOW_ATTR_MAP_DATALEN])); +} + #define TCPI_HAS_OPT(info, opt) !!(info->tcpi_options & (opt)) static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r, @@ -3012,6 +3066,14 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r, tcp_tls_conf("rxconf", tlsinfo[TLS_INFO_RXCONF]); tcp_tls_conf("txconf", tlsinfo[TLS_INFO_TXCONF]); } + if (ulpinfo[INET_ULP_INFO_MPTCP]) { + struct rtattr *sfinfo[MPTCP_SUBFLOW_ATTR_MAX + 1] = + { 0 }; + + parse_rtattr_nested(sfinfo, MPTCP_SUBFLOW_ATTR_MAX, + ulpinfo[INET_ULP_INFO_MPTCP]); + mptcp_subflow_info(sfinfo); + } } } From 2d8b5fe93e9decb56acc243905d82fb22d6c4cfd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Apr 2020 15:37:10 +0200 Subject: [PATCH 07/33] man: mptcp man page describe the mptcp subcommands implemented so far. 
Signed-off-by: Paolo Abeni Signed-off-by: David Ahern --- man/man8/ip-mptcp.8 | 142 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 man/man8/ip-mptcp.8 diff --git a/man/man8/ip-mptcp.8 b/man/man8/ip-mptcp.8 new file mode 100644 index 00000000..f6457e97 --- /dev/null +++ b/man/man8/ip-mptcp.8 @@ -0,0 +1,142 @@ +.TH IP\-MPTCP 8 "4 Apr 2020" "iproute2" "Linux" +.SH "NAME" +ip-mptcp \- MPTCP path manager configuration +.SH "SYNOPSIS" +.sp +.ad l +.in +8 +.ti -8 +.B ip +.RI "[ " OPTIONS " ]" +.B mptcp +.RB "{ " +.B endpoint +.RB " | " +.B limits +.RB " | " +.B help +.RB " }" +.sp + +.ti -8 +.BR "ip mptcp endpoint add " +.IR IFADDR +.RB "[ " dev +.IR IFNAME " ]" +.RB "[ " id +.I ID +.RB "] [ " +.I FLAG-LIST +.RB "] " + +.ti -8 +.BR "ip mptcp endpoint del id " +.I ID + +.ti -8 +.BR "ip mptcp endpoint show " +.RB "[ " id +.I ID +.RB "]" + +.ti -8 +.BR "ip mptcp endpoint flush" + +.ti -8 +.IR FLAG-LIST " := [ " FLAG-LIST " ] " FLAG + +.ti -8 +.IR FLAG " := [" +.B signal +.RB "|" +.B subflow +.RB "|" +.B backup +.RB "]" + +.ti -8 +.BR "ip mptcp limits set " +.RB "[ " +.B subflow +.IR SUBFLOW_NR " ]" +.RB "[ " +.B add_addr_accepted +.IR ADD_ADDR_ACCEPTED_NR " ]" + +.ti -8 +.BR "ip mptcp limits show" + +.SH DESCRIPTION + +MPTCP is a transport protocol built on top of TCP that allows TCP +connections to use multiple paths to maximize resource usage and increase +redundancy. The ip-mptcp sub-commands allow configuring several aspects of the +MPTCP path manager, which is in charge of subflows creation: + +.P +The +.B endpoint +object specifies the IP addresses that will be used and/or announced for +additional subflows: + +.TS +l l. 
+ip mptcp endpoint add add new MPTCP endpoint +ip mptcp endpoint delete delete existing MPTCP endpoint +ip mptcp endpoint show get existing MPTCP endpoint +ip mptcp endpoint flush flush all existing MPTCP endpoints +.TE + +.TP +.IR ID +is a unique numeric identifier for the given endpoint + +.TP +.BR signal +the endpoint will be announced/signalled to each peer via an ADD_ADDR MPTCP +sub-option + +.TP +.BR subflow +if additional subflow creation is allowed by MPTCP limits, the endpoint will +be used as the source address to create an additional subflow after +the MPTCP connection is established. + +.TP +.BR backup +the endpoint will be announced as a backup address, if this is a +.BR signal +endpoint, or the subflow will be created as a backup one if this is a +.BR subflow +endpoint + +.sp +.PP +The +.B limits +object specifies the constraints for subflow creations: + +.TS +l l. +ip mptcp limits show get current MPTCP subflow creation limits +ip mptcp limits set change the MPTCP subflow creation limits +.TE + +.TP +.IR SUBFLOW_NR +specifies the maximum number of additional subflows allowed for each MPTCP +connection. Additional subflows can be created due to: incoming accepted +ADD_ADDR option, local +.BR subflow +endpoints, additional subflows started by the peer. + +.TP +.IR ADD_ADDR_ACCEPTED_NR +specifies the maximum number of ADD_ADDR suboptions accepted for each MPTCP +connection. The MPTCP path manager will try to create a new subflow for +each accepted ADD_ADDR option, respecting the +.IR SUBFLOW_NR +limit.
+ +.SH AUTHOR +Original Manpage by Paolo Abeni From 0c42c6b130196d1d7e87acc5122f8fd325e75c5b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 29 Apr 2020 19:17:22 +0200 Subject: [PATCH 08/33] man: ip.8: add reference to mptcp man-page While at it, additionally fix a mandoc warning in mptcp.8 Signed-off-by: Paolo Abeni Signed-off-by: David Ahern --- man/man8/ip-mptcp.8 | 1 - man/man8/ip.8 | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/man/man8/ip-mptcp.8 b/man/man8/ip-mptcp.8 index f6457e97..ef8409ea 100644 --- a/man/man8/ip-mptcp.8 +++ b/man/man8/ip-mptcp.8 @@ -2,7 +2,6 @@ .SH "NAME" ip-mptcp \- MPTCP path manager configuration .SH "SYNOPSIS" -.sp .ad l .in +8 .ti -8 diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1613f790..c9f7671e 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -22,7 +22,7 @@ ip \- show / manipulate routing, network devices, interfaces and tunnels .BR link " | " address " | " addrlabel " | " route " | " rule " | " neigh " | "\ ntable " | " tunnel " | " tuntap " | " maddress " | " mroute " | " mrule " | "\ monitor " | " xfrm " | " netns " | " l2tp " | " tcp_metrics " | " token " | "\ - macsec " | " vrf " }" + macsec " | " vrf " | " mptcp " }" .sp .ti -8 @@ -268,6 +268,10 @@ readability. .B monitor - watch for netlink messages. +.TP +.B mptcp +- manage MPTCP path manager. + .TP .B mroute - multicast routing cache entry. @@ -405,6 +409,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR ip-link (8), .BR ip-maddress (8), .BR ip-monitor (8), +.BR ip-mptcp (8), .BR ip-mroute (8), .BR ip-neighbour (8), .BR ip-netns (8), From 846b6b2da8358d34aa4be8b310f90195e134b5b0 Mon Sep 17 00:00:00 2001 From: William Tu Date: Sun, 26 Apr 2020 08:04:15 -0700 Subject: [PATCH 09/33] erspan: Add type I version 0 support. The Type I ERSPAN frame format is based on the barebones IP + GRE(4-byte) encapsulation on top of the raw mirrored frame. Both type I and II use 0x88BE as protocol type. 
Unlike type II and III, no sequence number or key is required. To create a type I erspan tunnel device: $ ip link add dev erspan11 type erspan \ local 172.16.1.100 remote 172.16.1.200 \ erspan_ver 0 CC: Dmitriy Andreyevskiy Signed-off-by: William Tu Signed-off-by: David Ahern --- ip/link_gre.c | 4 ++-- ip/link_gre6.c | 6 +++--- man/man8/ip-link.8.in | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ip/link_gre.c b/ip/link_gre.c index d616a970..0461e5d0 100644 --- a/ip/link_gre.c +++ b/ip/link_gre.c @@ -354,8 +354,8 @@ get_failed: NEXT_ARG(); if (get_u8(&erspan_ver, *argv, 0)) invarg("invalid erspan version\n", *argv); - if (erspan_ver != 1 && erspan_ver != 2) - invarg("erspan version must be 1 or 2\n", *argv); + if (erspan_ver > 2) + invarg("erspan version must be 0/1/2\n", *argv); } else if (is_erspan && strcmp(*argv, "erspan_dir") == 0) { NEXT_ARG(); if (matches(*argv, "ingress") == 0) diff --git a/ip/link_gre6.c b/ip/link_gre6.c index 94a4ee70..9d270f4b 100644 --- a/ip/link_gre6.c +++ b/ip/link_gre6.c @@ -389,8 +389,8 @@ get_failed: NEXT_ARG(); if (get_u8(&erspan_ver, *argv, 0)) invarg("invalid erspan version\n", *argv); - if (erspan_ver != 1 && erspan_ver != 2) - invarg("erspan version must be 1 or 2\n", *argv); + if (erspan_ver > 2) + invarg("erspan version must be 0/1/2\n", *argv); } else if (strcmp(*argv, "erspan_dir") == 0) { NEXT_ARG(); if (matches(*argv, "ingress") == 0) @@ -430,7 +430,7 @@ get_failed: addattr_l(n, 1024, IFLA_GRE_FLOWINFO, &flowinfo, 4); addattr32(n, 1024, IFLA_GRE_FLAGS, flags); addattr32(n, 1024, IFLA_GRE_FWMARK, fwmark); - if (erspan_ver) { + if (erspan_ver <= 2) { addattr8(n, 1024, IFLA_GRE_ERSPAN_VER, erspan_ver); if (erspan_ver == 1 && erspan_idx != 0) { addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 939e2ad4..e8a25451 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -1163,8 +1163,8 @@ It must be an address on another
interface on this host. .BR erspan_ver " \fIversion " - specifies the ERSPAN version number. .IR version -indicates the ERSPAN version to be created: 1 for version 1 (type II) -or 2 for version 2 (type III). +indicates the ERSPAN version to be created: 0 for version 0 type I, +1 for version 1 (type II) or 2 for version 2 (type III). .sp .BR erspan " \fIIDX " From 081d6c310d3a6e0412431a9652f641dff3f72aee Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 28 Apr 2020 14:44:33 +0300 Subject: [PATCH 10/33] tc: pedit: Support JSON dumping The action pedit does not currently support dumping to JSON. Convert print_pedit() to the print_* family of functions so that dumping is correct both in plain and JSON mode. In plain mode, the output is character for character the same as it was before. In JSON mode, this is an example dump: $ tc filter add dev dummy0 ingress prio 125 flower \ action pedit ex munge udp dport set 12345 \ munge ip ttl add 1 \ munge offset 10 u8 clear $ tc -j filter show dev dummy0 ingress | jq [ { "protocol": "all", "pref": 125, "kind": "flower", "chain": 0 }, { "protocol": "all", "pref": 125, "kind": "flower", "chain": 0, "options": { "handle": 1, "keys": {}, "not_in_hw": true, "actions": [ { "order": 1, "kind": "pedit", "control_action": { "type": "pass" }, "nkeys": 3, "index": 1, "ref": 1, "bind": 1, "keys": [ { "htype": "udp", "offset": 0, "cmd": "set", "val": "3039", "mask": "ffff0000" }, { "htype": "ipv4", "offset": 8, "cmd": "add", "val": "1000000", "mask": "ffffff" }, { "htype": "network", "offset": 8, "cmd": "set", "val": "0", "mask": "ffff00ff" } ] } ] } } ] Signed-off-by: Petr Machata Signed-off-by: David Ahern --- tc/m_pedit.c | 67 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/tc/m_pedit.c b/tc/m_pedit.c index fccfd17c..51dcf109 100644 --- a/tc/m_pedit.c +++ b/tc/m_pedit.c @@ -714,20 +714,28 @@ static const char * const pedit_htype_str[] = { 
[TCA_PEDIT_KEY_EX_HDR_TYPE_UDP] = "udp", }; -static void print_pedit_location(FILE *f, - enum pedit_header_type htype, __u32 off) +static int print_pedit_location(FILE *f, + enum pedit_header_type htype, __u32 off) { - if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK) { - fprintf(f, "%d", (unsigned int)off); - return; + char *buf = NULL; + int rc; + + if (htype != TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK) { + if (htype < ARRAY_SIZE(pedit_htype_str)) + rc = asprintf(&buf, "%s", pedit_htype_str[htype]); + else + rc = asprintf(&buf, "unknown(%d)", htype); + if (rc < 0) + return rc; + print_string(PRINT_ANY, "htype", "%s", buf); + print_int(PRINT_ANY, "offset", "%+d", off); + } else { + print_string(PRINT_JSON, "htype", NULL, "network"); + print_int(PRINT_ANY, "offset", "%d", off); } - if (htype < ARRAY_SIZE(pedit_htype_str)) - fprintf(f, "%s", pedit_htype_str[htype]); - else - fprintf(f, "unknown(%d)", htype); - - fprintf(f, "%c%d", (int)off >= 0 ? '+' : '-', abs((int)off)); + free(buf); + return 0; } static int print_pedit(struct action_util *au, FILE *f, struct rtattr *arg) @@ -735,6 +743,7 @@ static int print_pedit(struct action_util *au, FILE *f, struct rtattr *arg) struct tc_pedit_sel *sel; struct rtattr *tb[TCA_PEDIT_MAX + 1]; struct m_pedit_key_ex *keys_ex = NULL; + int err; if (arg == NULL) return -1; @@ -774,11 +783,12 @@ static int print_pedit(struct action_util *au, FILE *f, struct rtattr *arg) } } - fprintf(f, " pedit "); + print_string(PRINT_ANY, "kind", " %s ", "pedit"); print_action_control(f, "action ", sel->action, " "); - fprintf(f,"keys %d\n ", sel->nkeys); - fprintf(f, "\t index %u ref %d bind %d", sel->index, sel->refcnt, - sel->bindcnt); + print_uint(PRINT_ANY, "nkeys", "keys %d\n", sel->nkeys); + print_uint(PRINT_ANY, "index", " \t index %u", sel->index); + print_int(PRINT_ANY, "ref", " ref %d", sel->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", sel->bindcnt); if (show_stats) { if (tb[TCA_PEDIT_TM]) { @@ -787,6 +797,7 @@ static int 
print_pedit(struct action_util *au, FILE *f, struct rtattr *arg) print_tm(f, tm); } } + open_json_array(PRINT_JSON, "keys"); if (sel->nkeys) { int i; struct tc_pedit_key *key = sel->keys; @@ -804,21 +815,31 @@ static int print_pedit(struct action_util *au, FILE *f, struct rtattr *arg) key_ex++; } - fprintf(f, "\n\t key #%d", i); + open_json_object(NULL); + print_uint(PRINT_FP, NULL, "\n\t key #%d at ", i); - fprintf(f, " at "); + err = print_pedit_location(f, htype, key->off); + if (err) + return err; - print_pedit_location(f, htype, key->off); - - fprintf(f, ": %s %08x mask %08x", - cmd ? "add" : "val", - (unsigned int)ntohl(key->val), - (unsigned int)ntohl(key->mask)); + /* In FP, report the "set" command as "val" to keep + * backward compatibility. Report the true name in JSON. + */ + print_string(PRINT_FP, NULL, ": %s", + cmd ? "add" : "val"); + print_string(PRINT_JSON, "cmd", NULL, + cmd ? "add" : "set"); + print_hex(PRINT_ANY, "val", " %08x", + (unsigned int)ntohl(key->val)); + print_hex(PRINT_ANY, "mask", " mask %08x", + (unsigned int)ntohl(key->mask)); + close_json_object(); } } else { fprintf(f, "\npedit %x keys %d is not LEGIT", sel->index, sel->nkeys); } + close_json_array(PRINT_JSON, " "); print_nl(); From ca7614d4c6f456187d831a8202bb4a8559a72f8b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:45 +0800 Subject: [PATCH 11/33] iproute_lwtunnel: add options support for geneve metadata This patch is to add LWTUNNEL_IP(6)_OPTS and LWTUNNEL_IP_OPTS_GENEVE's parse and print to implement geneve options support in iproute_lwtunnel. Options are expressed as class:type:data and multiple options may be listed using a comma delimiter, class and type are numbers and data is a hex string. 
With this patch, users can add and dump geneve options like: # ip netns add a # ip netns add b # ip -n a link add eth0 type veth peer name eth0 netns b # ip -n a link set eth0 up; ip -n b link set eth0 up # ip -n a addr add 10.1.0.1/24 dev eth0 # ip -n b addr add 10.1.0.2/24 dev eth0 # ip -n b link add geneve1 type geneve id 1 remote 10.1.0.1 ttl 64 # ip -n b addr add 1.1.1.1/24 dev geneve1 # ip -n b link set geneve1 up # ip -n b route add 2.1.1.0/24 dev geneve1 # ip -n a link add geneve1 type geneve external # ip -n a addr add 2.1.1.1/24 dev geneve1 # ip -n a link set geneve1 up # ip -n a route add 1.1.1.0/24 encap ip id 1 geneve_opts \ 1:1:1212121234567890,1:1:1212121234567890,1:1:1212121234567890 \ dst 10.1.0.2 dev geneve1 # ip -n a route show # ip netns exec a ping 1.1.1.1 -c 1 1.1.1.0/24 encap ip id 1 src 0.0.0.0 dst 10.1.0.2 ttl 0 tos 0 geneve_opts 1:1:1212121234567890,1:1:1212121234567890 ... PING 1.1.1.1 (1.1.1.1) 56(84) bytes of data. 64 bytes from 1.1.1.1: icmp_seq=1 ttl=64 time=0.079 ms v1->v2: - improve the changelog. - use PRINT_ANY to support dumping with json format. v2->v3: - implement proper JSON array for opts instead of just bunch of strings. v3->v4: - keep the same format between input and output, json and non json. - print class and type as uint and print data as hex string. 
Signed-off-by: Xin Long Signed-off-by: David Ahern --- ip/iproute_lwtunnel.c | 174 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 172 insertions(+), 2 deletions(-) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 0d7d7149..85998536 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -294,6 +294,54 @@ static void print_encap_mpls(FILE *fp, struct rtattr *encap) rta_getattr_u8(tb[MPLS_IPTUNNEL_TTL])); } +static void lwtunnel_print_geneve_opts(struct rtattr *attr) +{ + struct rtattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + int rem = RTA_PAYLOAD(attr); + char *name = "geneve_opts"; + int data_len, offset = 0; + char data[rem * 2 + 1]; + __u16 class; + __u8 type; + + print_nl(); + print_string(PRINT_FP, name, "\t%s ", name); + open_json_array(PRINT_JSON, name); + + while (rem) { + parse_rtattr(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, i, rem); + class = rta_getattr_be16(tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]); + type = rta_getattr_u8(tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]); + data_len = RTA_PAYLOAD(tb[LWTUNNEL_IP_OPT_GENEVE_DATA]); + hexstring_n2a(RTA_DATA(tb[LWTUNNEL_IP_OPT_GENEVE_DATA]), + data_len, data, sizeof(data)); + offset += data_len + 20; + rem -= data_len + 20; + i = RTA_DATA(attr) + offset; + + open_json_object(NULL); + print_uint(PRINT_ANY, "class", "%u", class); + print_uint(PRINT_ANY, "type", ":%u", type); + if (rem) + print_string(PRINT_ANY, "data", ":%s,", data); + else + print_string(PRINT_ANY, "data", ":%s ", data); + close_json_object(); + } + + close_json_array(PRINT_JSON, name); +} + +static void lwtunnel_print_opts(struct rtattr *attr) +{ + struct rtattr *tb_opt[LWTUNNEL_IP_OPTS_MAX + 1]; + + parse_rtattr_nested(tb_opt, LWTUNNEL_IP_OPTS_MAX, attr); + if (tb_opt[LWTUNNEL_IP_OPTS_GENEVE]) + lwtunnel_print_geneve_opts(tb_opt[LWTUNNEL_IP_OPTS_GENEVE]); +} + static void print_encap_ip(FILE *fp, struct rtattr *encap) { struct rtattr *tb[LWTUNNEL_IP_MAX+1]; @@ -332,6 +380,9 @@ static void 
print_encap_ip(FILE *fp, struct rtattr *encap) if (flags & TUNNEL_SEQ) print_bool(PRINT_ANY, "seq", "seq ", true); } + + if (tb[LWTUNNEL_IP_OPTS]) + lwtunnel_print_opts(tb[LWTUNNEL_IP_OPTS]); } static void print_encap_ila(FILE *fp, struct rtattr *encap) @@ -404,6 +455,9 @@ static void print_encap_ip6(FILE *fp, struct rtattr *encap) if (flags & TUNNEL_SEQ) print_bool(PRINT_ANY, "seq", "seq ", true); } + + if (tb[LWTUNNEL_IP6_OPTS]) + lwtunnel_print_opts(tb[LWTUNNEL_IP6_OPTS]); } static void print_encap_bpf(FILE *fp, struct rtattr *encap) @@ -798,11 +852,97 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len, return 0; } +static int lwtunnel_parse_geneve_opt(char *str, size_t len, struct rtattr *rta) +{ + struct rtattr *nest; + char *token; + int i, err; + + nest = rta_nest(rta, len, LWTUNNEL_IP_OPTS_GENEVE | NLA_F_NESTED); + i = 1; + token = strsep(&str, ":"); + while (token) { + switch (i) { + case LWTUNNEL_IP_OPT_GENEVE_CLASS: + { + __be16 opt_class; + + if (!strlen(token)) + break; + err = get_be16(&opt_class, token, 0); + if (err) + return err; + + rta_addattr16(rta, len, i, opt_class); + break; + } + case LWTUNNEL_IP_OPT_GENEVE_TYPE: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + rta_addattr8(rta, len, i, opt_type); + break; + } + case LWTUNNEL_IP_OPT_GENEVE_DATA: + { + size_t token_len = strlen(token); + __u8 *opts; + + if (!token_len) + break; + opts = malloc(token_len / 2); + if (!opts) + return -1; + if (hex2mem(token, opts, token_len / 2) < 0) { + free(opts); + return -1; + } + rta_addattr_l(rta, len, i, opts, token_len / 2); + free(opts); + + break; + } + default: + fprintf(stderr, "Unknown \"geneve_opts\" type\n"); + return -1; + } + + token = strsep(&str, ":"); + i++; + } + rta_nest_end(rta, nest); + + return 0; +} + +static int lwtunnel_parse_geneve_opts(char *str, size_t len, struct rtattr *rta) +{ + char *token; + int err; + + token = strsep(&str, ","); + while (token) 
{ + err = lwtunnel_parse_geneve_opt(token, len, rta); + if (err) + return err; + + token = strsep(&str, ","); + } + + return 0; +} + static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0; - int key_ok = 0, csum_ok = 0, seq_ok = 0; + int key_ok = 0, csum_ok = 0, seq_ok = 0, opts_ok = 0; char **argv = *argvp; int argc = *argcp; int ret = 0; @@ -854,6 +994,21 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, if (get_u8(&ttl, *argv, 0)) invarg("\"ttl\" value is invalid\n", *argv); ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl); + } else if (strcmp(*argv, "geneve_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_geneve_opts(*argv, len, rta); + if (ret) + invarg("\"geneve_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); @@ -969,7 +1124,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { int id_ok = 0, dst_ok = 0, src_ok = 0, tos_ok = 0, ttl_ok = 0; - int key_ok = 0, csum_ok = 0, seq_ok = 0; + int key_ok = 0, csum_ok = 0, seq_ok = 0, opts_ok = 0; char **argv = *argvp; int argc = *argcp; int ret = 0; @@ -1023,6 +1178,21 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, *argv); ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit); + } else if (strcmp(*argv, "geneve_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_geneve_opts(*argv, len, rta); + if (ret) + invarg("\"geneve_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); From 
b1bc0f38922220b379ed39552a5e2a7cf9dccd92 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:46 +0800 Subject: [PATCH 12/33] iproute_lwtunnel: add options support for vxlan metadata This patch is to add LWTUNNEL_IP_OPTS_VXLAN's parse and print to implement vxlan options support in iproute_lwtunnel. Option is expressed as a number for gbp only, and vxlan doesn't support multiple options. With this patch, users can add and dump vxlan options like: # ip netns add a # ip netns add b # ip -n a link add eth0 type veth peer name eth0 netns b # ip -n a link set eth0 up # ip -n b link set eth0 up # ip -n a addr add 10.1.0.1/24 dev eth0 # ip -n b addr add 10.1.0.2/24 dev eth0 # ip -n b link add vxlan1 type vxlan id 1 local 10.1.0.2 \ remote 10.1.0.1 dev eth0 ttl 64 gbp # ip -n b addr add 1.1.1.1/24 dev vxlan1 # ip -n b link set vxlan1 up # ip -n b route add 2.1.1.0/24 dev vxlan1 # ip -n a link add vxlan1 type vxlan local 10.1.0.1 dev eth0 ttl 64 \ gbp external # ip -n a addr add 2.1.1.1/24 dev vxlan1 # ip -n a link set vxlan1 up # ip -n a route add 1.1.1.0/24 encap ip id 1 \ vxlan_opts 1110 dst 10.1.0.2 dev vxlan1 # ip -n a route show # ip netns exec a ping 1.1.1.1 -c 1 1.1.1.0/24 encap ip id 1 src 0.0.0.0 dst 10.1.0.2 ttl 0 tos 0 vxlan_opts 1110 dev vxlan1 scope link PING 1.1.1.1 (1.1.1.1) 56(84) bytes of data. 64 bytes from 1.1.1.1: icmp_seq=1 ttl=64 time=0.111 ms v1->v2: - improve the changelog. - get_u32 with base = 0 for gbp. - use PRINT_ANY to support dumping with json format. v2->v3: - implement proper JSON array for opts. v3->v4: - keep the same format between input and output, json and non json. - print gbp as uint.
Signed-off-by: Xin Long Signed-off-by: David Ahern --- ip/iproute_lwtunnel.c | 68 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 85998536..bbd0ace7 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -333,6 +333,26 @@ static void lwtunnel_print_geneve_opts(struct rtattr *attr) close_json_array(PRINT_JSON, name); } +static void lwtunnel_print_vxlan_opts(struct rtattr *attr) +{ + struct rtattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + int rem = RTA_PAYLOAD(attr); + char *name = "vxlan_opts"; + __u32 gbp; + + parse_rtattr(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, i, rem); + gbp = rta_getattr_u32(tb[LWTUNNEL_IP_OPT_VXLAN_GBP]); + + print_nl(); + print_string(PRINT_FP, name, "\t%s ", name); + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_ANY, "gbp", "%u ", gbp); + close_json_object(); + close_json_array(PRINT_JSON, name); +} + static void lwtunnel_print_opts(struct rtattr *attr) { struct rtattr *tb_opt[LWTUNNEL_IP_OPTS_MAX + 1]; @@ -340,6 +360,8 @@ static void lwtunnel_print_opts(struct rtattr *attr) parse_rtattr_nested(tb_opt, LWTUNNEL_IP_OPTS_MAX, attr); if (tb_opt[LWTUNNEL_IP_OPTS_GENEVE]) lwtunnel_print_geneve_opts(tb_opt[LWTUNNEL_IP_OPTS_GENEVE]); + else if (tb_opt[LWTUNNEL_IP_OPTS_VXLAN]) + lwtunnel_print_vxlan_opts(tb_opt[LWTUNNEL_IP_OPTS_VXLAN]); } static void print_encap_ip(FILE *fp, struct rtattr *encap) @@ -938,6 +960,22 @@ static int lwtunnel_parse_geneve_opts(char *str, size_t len, struct rtattr *rta) return 0; } +static int lwtunnel_parse_vxlan_opts(char *str, size_t len, struct rtattr *rta) +{ + struct rtattr *nest; + __u32 gbp; + int err; + + nest = rta_nest(rta, len, LWTUNNEL_IP_OPTS_VXLAN | NLA_F_NESTED); + err = get_u32(&gbp, str, 0); + if (err) + return err; + rta_addattr32(rta, len, LWTUNNEL_IP_OPT_VXLAN_GBP, gbp); + + rta_nest_end(rta, nest); + return 0; +} + static int 
parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { @@ -1009,6 +1047,21 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, invarg("\"geneve_opts\" value is invalid\n", *argv); rta_nest_end(rta, nest); + } else if (strcmp(*argv, "vxlan_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_vxlan_opts(*argv, len, rta); + if (ret) + invarg("\"vxlan_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); @@ -1193,6 +1246,21 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len, invarg("\"geneve_opts\" value is invalid\n", *argv); rta_nest_end(rta, nest); + } else if (strcmp(*argv, "vxlan_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_vxlan_opts(*argv, len, rta); + if (ret) + invarg("\"vxlan_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); From 39fa047938fbef6cd08687b0daa4d86afbfdc61c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:47 +0800 Subject: [PATCH 13/33] iproute_lwtunnel: add options support for erspan metadata This patch is to add LWTUNNEL_IP_OPTS_ERSPAN's parse and print to implement erspan options support in iproute_lwtunnel. Option is expressed as version:index:dir:hwid, dir and hwid will be parsed when version is 2, while index will be parsed when version is 1. All of these are numbers. erspan doesn't support multiple options. 
With this patch, users can add and dump erspan options like: # ip netns add a # ip netns add b # ip -n a link add eth0 type veth peer name eth0 netns b # ip -n a link set eth0 up # ip -n b link set eth0 up # ip -n a addr add 10.1.0.1/24 dev eth0 # ip -n b addr add 10.1.0.2/24 dev eth0 # ip -n b link add erspan1 type erspan key 1 seq erspan 123 \ local 10.1.0.2 remote 10.1.0.1 # ip -n b addr add 1.1.1.1/24 dev erspan1 # ip -n b link set erspan1 up # ip -n b route add 2.1.1.0/24 dev erspan1 # ip -n a link add erspan1 type erspan key 1 seq local 10.1.0.1 external # ip -n a addr add 2.1.1.1/24 dev erspan1 # ip -n a link set erspan1 up # ip -n a route add 1.1.1.0/24 encap ip id 1 \ erspan_opts 2:123:1:2 dst 10.1.0.2 dev erspan1 # ip -n a route show # ip netns exec a ping 1.1.1.1 -c 1 1.1.1.0/24 encap ip id 1 src 0.0.0.0 dst 10.1.0.2 ttl 0 tos 0 erspan_opts 2:0:1:2 dev erspan1 scope link PING 1.1.1.1 (1.1.1.1) 56(84) bytes of data. 64 bytes from 1.1.1.1: icmp_seq=1 ttl=64 time=0.124 ms v1->v2: - improve the changelog. - use PRINT_ANY to support dumping with json format. v2->v3: - implement proper JSON object for opts instead of just bunch of strings. v3->v4: - keep the same format between input and output, json and non json. - print version, index, dir and hwid as uint. 
Signed-off-by: Xin Long Signed-off-by: David Ahern --- ip/iproute_lwtunnel.c | 140 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index bbd0ace7..ff7c9d7f 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -353,6 +353,38 @@ static void lwtunnel_print_vxlan_opts(struct rtattr *attr) close_json_array(PRINT_JSON, name); } +static void lwtunnel_print_erspan_opts(struct rtattr *attr) +{ + struct rtattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + char *name = "erspan_opts"; + __u8 ver, hwid, dir; + __u32 idx; + + parse_rtattr(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, i, RTA_PAYLOAD(attr)); + ver = rta_getattr_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]); + if (ver == 1) { + idx = rta_getattr_be32(tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]); + dir = 0; + hwid = 0; + } else { + idx = 0; + dir = rta_getattr_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]); + hwid = rta_getattr_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]); + } + + print_nl(); + print_string(PRINT_FP, name, "\t%s ", name); + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_ANY, "ver", "%u", ver); + print_uint(PRINT_ANY, "index", ":%u", idx); + print_uint(PRINT_ANY, "dir", ":%u", dir); + print_uint(PRINT_ANY, "hwid", ":%u ", hwid); + close_json_object(); + close_json_array(PRINT_JSON, name); +} + static void lwtunnel_print_opts(struct rtattr *attr) { struct rtattr *tb_opt[LWTUNNEL_IP_OPTS_MAX + 1]; @@ -362,6 +394,8 @@ static void lwtunnel_print_opts(struct rtattr *attr) lwtunnel_print_geneve_opts(tb_opt[LWTUNNEL_IP_OPTS_GENEVE]); else if (tb_opt[LWTUNNEL_IP_OPTS_VXLAN]) lwtunnel_print_vxlan_opts(tb_opt[LWTUNNEL_IP_OPTS_VXLAN]); + else if (tb_opt[LWTUNNEL_IP_OPTS_ERSPAN]) + lwtunnel_print_erspan_opts(tb_opt[LWTUNNEL_IP_OPTS_ERSPAN]); } static void print_encap_ip(FILE *fp, struct rtattr *encap) @@ -976,6 +1010,82 @@ static int lwtunnel_parse_vxlan_opts(char *str, size_t len, struct rtattr *rta) 
return 0; } +static int lwtunnel_parse_erspan_opts(char *str, size_t len, struct rtattr *rta) +{ + struct rtattr *nest; + char *token; + int i, err; + + nest = rta_nest(rta, len, LWTUNNEL_IP_OPTS_ERSPAN | NLA_F_NESTED); + i = 1; + token = strsep(&str, ":"); + while (token) { + switch (i) { + case LWTUNNEL_IP_OPT_ERSPAN_VER: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + rta_addattr8(rta, len, i, opt_type); + break; + } + case LWTUNNEL_IP_OPT_ERSPAN_INDEX: + { + __be32 opt_class; + + if (!strlen(token)) + break; + err = get_be32(&opt_class, token, 0); + if (err) + return err; + + rta_addattr32(rta, len, i, opt_class); + break; + } + case LWTUNNEL_IP_OPT_ERSPAN_DIR: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + rta_addattr8(rta, len, i, opt_type); + break; + } + case LWTUNNEL_IP_OPT_ERSPAN_HWID: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + rta_addattr8(rta, len, i, opt_type); + break; + } + default: + fprintf(stderr, "Unknown \"geneve_opts\" type\n"); + return -1; + } + + token = strsep(&str, ":"); + i++; + } + + rta_nest_end(rta, nest); + return 0; +} + static int parse_encap_ip(struct rtattr *rta, size_t len, int *argcp, char ***argvp) { @@ -1062,6 +1172,21 @@ static int parse_encap_ip(struct rtattr *rta, size_t len, invarg("\"vxlan_opts\" value is invalid\n", *argv); rta_nest_end(rta, nest); + } else if (strcmp(*argv, "erspan_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_erspan_opts(*argv, len, rta); + if (ret) + invarg("\"erspan_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); @@ -1261,6 +1386,21 @@ static int 
parse_encap_ip6(struct rtattr *rta, size_t len, invarg("\"vxlan_opts\" value is invalid\n", *argv); rta_nest_end(rta, nest); + } else if (strcmp(*argv, "erspan_opts") == 0) { + struct rtattr *nest; + + if (opts_ok++) + duparg2("opts", *argv); + + NEXT_ARG(); + + nest = rta_nest(rta, len, + LWTUNNEL_IP_OPTS | NLA_F_NESTED); + ret = lwtunnel_parse_erspan_opts(*argv, len, rta); + if (ret) + invarg("\"erspan_opts\" value is invalid\n", + *argv); + rta_nest_end(rta, nest); } else if (strcmp(*argv, "key") == 0) { if (key_ok++) duparg2("key", *argv); From f72c3ad00f3b7869e90840d0098a83cb88224892 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:48 +0800 Subject: [PATCH 14/33] tc: m_tunnel_key: add options support for vxlan This patch is to add TCA_TUNNEL_KEY_ENC_OPTS_VXLAN's parse and print to implement vxlan options support in m_tunnel_key, like Commit 6217917a3826 ("tc: m_tunnel_key: Add tunnel option support to act_tunnel_key") for geneve options support. Option is expressed as a 32bit number for gbp only, and vxlan doesn't support multiple options. With this patch, users can add and dump vxlan options like: # ip link add name vxlan1 type vxlan dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ vxlan_opts 65793 \ action mirred egress redirect dev vxlan1 # tc -s filter show dev eth0 parent ffff: filter protocol ip pref 49152 flower chain 0 handle 0x1 indev eth0 eth_type ipv4 ip_proto udp not_in_hw action order 1: tunnel_key set src_ip 10.0.99.192 dst_ip 10.0.99.193 key_id 11 dst_port 6081 vxlan_opts 65793 ... v1->v2: - get_u32 with base = 0 for gbp. - use print_uint("0x%x") to print gbp. v2->v3: - implement proper JSON array for opts. v3->v4: - keep the same format between input and output, json and non json. - print gbp as uint.
Signed-off-by: Xin Long Signed-off-by: David Ahern --- man/man8/tc-tunnel_key.8 | 10 ++++- tc/m_tunnel_key.c | 84 +++++++++++++++++++++++++++++++++++----- 2 files changed, 84 insertions(+), 10 deletions(-) diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8 index 2145eb62..c208e2c8 100644 --- a/man/man8/tc-tunnel_key.8 +++ b/man/man8/tc-tunnel_key.8 @@ -66,8 +66,10 @@ options. .B id , .B dst_port -and +, .B geneve_opts +and +.B vxlan_opts are optional. .RS .TP @@ -91,6 +93,12 @@ is specified in the form CLASS:TYPE:DATA, where CLASS is represented as a variable length hexadecimal value. Additionally multiple options may be listed using a comma delimiter. .TP +.B vxlan_opts +Vxlan metatdata options. +.B vxlan_opts +is specified in the form GBP, as a 32bit number. Multiple options is not +supported. +.TP .B tos Outer header TOS .TP diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c index 8fde6891..76391d6c 100644 --- a/tc/m_tunnel_key.c +++ b/tc/m_tunnel_key.c @@ -29,7 +29,7 @@ static void explain(void) "src_ip (mandatory)\n" "dst_ip (mandatory)\n" "dst_port \n" - "geneve_opts \n" + "geneve_opts | vxlan_opts \n" "csum | nocsum (default is \"csum\")\n"); } @@ -112,6 +112,21 @@ static int tunnel_key_parse_u8(char *str, int base, int type, return 0; } +static int tunnel_key_parse_u32(char *str, int base, int type, + struct nlmsghdr *n) +{ + __u32 value; + int ret; + + ret = get_u32(&value, str, base); + if (ret) + return ret; + + addattr32(n, MAX_MSG, type, value); + + return 0; +} + static int tunnel_key_parse_geneve_opt(char *str, struct nlmsghdr *n) { char *token, *saveptr = NULL; @@ -190,6 +205,27 @@ static int tunnel_key_parse_geneve_opts(char *str, struct nlmsghdr *n) return 0; } +static int tunnel_key_parse_vxlan_opt(char *str, struct nlmsghdr *n) +{ + struct rtattr *encap, *nest; + int ret; + + encap = addattr_nest(n, MAX_MSG, + TCA_TUNNEL_KEY_ENC_OPTS | NLA_F_NESTED); + nest = addattr_nest(n, MAX_MSG, + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN | NLA_F_NESTED); 
+ + ret = tunnel_key_parse_u32(str, 0, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, n); + if (ret) + return ret; + + addattr_nest_end(n, nest); + addattr_nest_end(n, encap); + + return 0; +} + static int tunnel_key_parse_tos_ttl(char *str, int type, struct nlmsghdr *n) { int ret; @@ -287,6 +323,13 @@ static int parse_tunnel_key(struct action_util *a, int *argc_p, char ***argv_p, fprintf(stderr, "Illegal \"geneve_opts\"\n"); return -1; } + } else if (matches(*argv, "vxlan_opts") == 0) { + NEXT_ARG(); + + if (tunnel_key_parse_vxlan_opt(*argv, n)) { + fprintf(stderr, "Illegal \"vxlan_opts\"\n"); + return -1; + } } else if (matches(*argv, "tos") == 0) { NEXT_ARG(); ret = tunnel_key_parse_tos_ttl(*argv, @@ -406,13 +449,13 @@ static void tunnel_key_print_flag(FILE *f, const char *name_on, rta_getattr_u8(attr) ? name_on : name_off); } -static void tunnel_key_print_geneve_options(const char *name, - struct rtattr *attr) +static void tunnel_key_print_geneve_options(struct rtattr *attr) { struct rtattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; struct rtattr *i = RTA_DATA(attr); int ii, data_len = 0, offset = 0; int rem = RTA_PAYLOAD(attr); + char *name = "geneve_opts"; char strbuf[rem * 2 + 1]; char data[rem * 2 + 1]; uint8_t data_r[rem]; @@ -421,7 +464,7 @@ static void tunnel_key_print_geneve_options(const char *name, open_json_array(PRINT_JSON, name); print_nl(); - print_string(PRINT_FP, name, "\t%s ", "geneve_opt"); + print_string(PRINT_FP, name, "\t%s ", name); while (rem) { parse_rtattr(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, i, rem); @@ -454,7 +497,27 @@ static void tunnel_key_print_geneve_options(const char *name, close_json_array(PRINT_JSON, name); } -static void tunnel_key_print_key_opt(const char *name, struct rtattr *attr) +static void tunnel_key_print_vxlan_options(struct rtattr *attr) +{ + struct rtattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + int rem = RTA_PAYLOAD(attr); + char *name = "vxlan_opts"; + __u32 gbp; + + 
parse_rtattr(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, i, rem); + gbp = rta_getattr_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + + print_nl(); + print_string(PRINT_FP, name, "\t%s ", name); + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_ANY, "gbp", "%u", gbp); + close_json_object(); + close_json_array(PRINT_JSON, name); +} + +static void tunnel_key_print_key_opt(struct rtattr *attr) { struct rtattr *tb[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1]; @@ -462,8 +525,12 @@ static void tunnel_key_print_key_opt(const char *name, struct rtattr *attr) return; parse_rtattr_nested(tb, TCA_TUNNEL_KEY_ENC_OPTS_MAX, attr); - tunnel_key_print_geneve_options(name, - tb[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]); + if (tb[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]) + tunnel_key_print_geneve_options( + tb[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]); + else if (tb[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]) + tunnel_key_print_vxlan_options( + tb[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]); } static void tunnel_key_print_tos_ttl(FILE *f, char *name, @@ -519,8 +586,7 @@ static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg) tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); tunnel_key_print_dst_port(f, "dst_port", tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); - tunnel_key_print_key_opt("geneve_opts", - tb[TCA_TUNNEL_KEY_ENC_OPTS]); + tunnel_key_print_key_opt(tb[TCA_TUNNEL_KEY_ENC_OPTS]); tunnel_key_print_flag(f, "nocsum", "csum", tb[TCA_TUNNEL_KEY_NO_CSUM]); tunnel_key_print_tos_ttl(f, "tos", From 668fd9b25d9eca3067040273239f7825db95442b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:49 +0800 Subject: [PATCH 15/33] tc: m_tunnel_key: add options support for erpsan This patch is to add TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN's parse and print to implement erspan options support in m_tunnel_key, like Commit 6217917a3826 ("tc: m_tunnel_key: Add tunnel option support to act_tunnel_key") for geneve options support. 
Option is expressed as version:index:dir:hwid, dir and hwid will be parsed when version is 2, while index will be parsed when version is 1. erspan doesn't support multiple options. With this patch, users can add and dump erspan options like: # ip link add name erspan1 type erspan external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ erspan_opts 1:2:0:0 \ action mirred egress redirect dev erspan1 # tc -s filter show dev eth0 parent ffff: filter protocol ip pref 49151 flower chain 0 handle 0x1 indev eth0 eth_type ipv4 ip_proto udp not_in_hw action order 1: tunnel_key set src_ip 10.0.99.192 dst_ip 10.0.99.193 key_id 11 dst_port 6081 erspan_opts 1:2:0:0 csum pipe index 2 ref 1 bind 1 ... v1->v2: - no change. v2->v3: - no change. v3->v4: - keep the same format between input and output, json and non json. - print version, index, dir and hwid as uint. Signed-off-by: Xin Long Signed-off-by: David Ahern --- man/man8/tc-tunnel_key.8 | 12 +++- tc/m_tunnel_key.c | 117 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8 index c208e2c8..ad997240 100644 --- a/man/man8/tc-tunnel_key.8 +++ b/man/man8/tc-tunnel_key.8 @@ -68,8 +68,10 @@ options. .B dst_port , .B geneve_opts -and +, .B vxlan_opts +and +.B erspan_opts are optional. .RS .TP @@ -99,6 +101,14 @@ Vxlan metatdata options. is specified in the form GBP, as a 32bit number. Multiple options is not supported. .TP +.B erspan_opts +Erspan metadata options. +.B erspan_opts +is specified in the form VERSION:INDEX:DIR:HWID, where VERSION is represented +as an 8bit number, INDEX as a 32bit number, DIR and HWID as 8bit numbers. +Multiple options are not supported. Note INDEX is used when VERSION is 1, +and DIR and HWID are used when VERSION is 2.
+.TP .B tos Outer header TOS .TP diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c index 76391d6c..a56fe244 100644 --- a/tc/m_tunnel_key.c +++ b/tc/m_tunnel_key.c @@ -29,7 +29,7 @@ static void explain(void) "src_ip (mandatory)\n" "dst_ip (mandatory)\n" "dst_port \n" - "geneve_opts | vxlan_opts \n" + "geneve_opts | vxlan_opts | erspan_opts \n" "csum | nocsum (default is \"csum\")\n"); } @@ -97,6 +97,21 @@ static int tunnel_key_parse_be16(char *str, int base, int type, return 0; } +static int tunnel_key_parse_be32(char *str, int base, int type, + struct nlmsghdr *n) +{ + __be32 value; + int ret; + + ret = get_be32(&value, str, base); + if (ret) + return ret; + + addattr32(n, MAX_MSG, type, value); + + return 0; +} + static int tunnel_key_parse_u8(char *str, int base, int type, struct nlmsghdr *n) { @@ -226,6 +241,63 @@ static int tunnel_key_parse_vxlan_opt(char *str, struct nlmsghdr *n) return 0; } +static int tunnel_key_parse_erspan_opt(char *str, struct nlmsghdr *n) +{ + char *token, *saveptr = NULL; + struct rtattr *encap, *nest; + int i, ret; + + encap = addattr_nest(n, MAX_MSG, + TCA_TUNNEL_KEY_ENC_OPTS | NLA_F_NESTED); + nest = addattr_nest(n, MAX_MSG, + TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN | NLA_F_NESTED); + + token = strtok_r(str, ":", &saveptr); + i = 1; + while (token) { + switch (i) { + case TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER: + { + ret = tunnel_key_parse_u8(token, 0, i, n); + if (ret) + return ret; + break; + } + case TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX: + { + ret = tunnel_key_parse_be32(token, 0, i, n); + if (ret) + return ret; + break; + } + case TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR: + { + ret = tunnel_key_parse_u8(token, 0, i, n); + if (ret) + return ret; + break; + } + case TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID: + { + ret = tunnel_key_parse_u8(token, 0, i, n); + if (ret) + return ret; + break; + } + default: + return -1; + } + + token = strtok_r(NULL, ":", &saveptr); + i++; + } + + addattr_nest_end(n, nest); + addattr_nest_end(n, encap); + + return 0; +} + static 
int tunnel_key_parse_tos_ttl(char *str, int type, struct nlmsghdr *n) { int ret; @@ -330,6 +402,13 @@ static int parse_tunnel_key(struct action_util *a, int *argc_p, char ***argv_p, fprintf(stderr, "Illegal \"vxlan_opts\"\n"); return -1; } + } else if (matches(*argv, "erspan_opts") == 0) { + NEXT_ARG(); + + if (tunnel_key_parse_erspan_opt(*argv, n)) { + fprintf(stderr, "Illegal \"erspan_opts\"\n"); + return -1; + } } else if (matches(*argv, "tos") == 0) { NEXT_ARG(); ret = tunnel_key_parse_tos_ttl(*argv, @@ -517,6 +596,39 @@ static void tunnel_key_print_vxlan_options(struct rtattr *attr) close_json_array(PRINT_JSON, name); } +static void tunnel_key_print_erspan_options(struct rtattr *attr) +{ + struct rtattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + int rem = RTA_PAYLOAD(attr); + char *name = "erspan_opts"; + __u8 ver, hwid, dir; + __u32 idx; + + parse_rtattr(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, i, rem); + ver = rta_getattr_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); + if (ver == 1) { + idx = rta_getattr_be32(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]); + dir = 0; + hwid = 0; + } else { + idx = 0; + dir = rta_getattr_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]); + hwid = rta_getattr_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]); + } + + print_nl(); + print_string(PRINT_FP, name, "\t%s ", name); + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_ANY, "ver", "%u", ver); + print_uint(PRINT_ANY, "index", ":%u", idx); + print_uint(PRINT_ANY, "dir", ":%u", dir); + print_uint(PRINT_ANY, "hwid", ":%u", hwid); + close_json_object(); + close_json_array(PRINT_JSON, name); +} + static void tunnel_key_print_key_opt(struct rtattr *attr) { struct rtattr *tb[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1]; @@ -531,6 +643,9 @@ static void tunnel_key_print_key_opt(struct rtattr *attr) else if (tb[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]) tunnel_key_print_vxlan_options( tb[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]); + else if 
(tb[TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN]) + tunnel_key_print_erspan_options( + tb[TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN]); } static void tunnel_key_print_tos_ttl(FILE *f, char *name, From 93c8d5f72f8ce4b98c68508e85457f83933302c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:50 +0800 Subject: [PATCH 16/33] tc: f_flower: add options support for vxlan This patch is to add TCA_FLOWER_KEY_ENC_OPTS_VXLAN's parse and print to implement vxlan options support in m_tunnel_key, like Commit 56155d4df86d ("tc: f_flower: add geneve option match support to flower") for geneve options support. Option is expressed a 32bit number for gbp only, and vxlan doesn't support multiple options. With this patch, users can add and dump vxlan options like: # ip link add name vxlan1 type vxlan dstport 0 external # tc qdisc add dev vxlan1 ingress # tc filter add dev vxlan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ vxlan_opts 65793/4008635966 \ ip_proto udp \ action mirred egress redirect dev eth1 # tc -s filter show dev vxlan1 parent ffff: filter protocol ip pref 49152 flower chain 0 handle 0x1 eth_type ipv4 ip_proto udp enc_dst_ip 10.0.99.193 enc_src_ip 10.0.99.192 enc_key_id 11 vxlan_opts 65793/4008635966 not_in_hw action order 1: mirred (Egress Redirect to device eth1) stolen index 3 ref 1 bind 1 Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 v1->v2: - get_u32 with base = 0 for gbp. v2->v3: - implement proper JSON array for opts. v3->v4: - keep the same format between input and output, json and non json. - print gbp as uint. 
Signed-off-by: Xin Long Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 12 ++++ tc/f_flower.c | 130 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 126 insertions(+), 16 deletions(-) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index eb9eb5f0..3c7246b8 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -81,7 +81,11 @@ flower \- flow based traffic control filter .IR TOS " | " .B enc_ttl .IR TTL " | " +{ .B geneve_opts +| +.B vxlan_opts +} .IR OPTIONS " | " .BR ip_flags .IR IP_FLAGS @@ -326,6 +330,8 @@ Match the connection zone, and can be masked. .RE .TP .BI geneve_opts " OPTIONS" +.TQ +.BI vxlan_opts " OPTIONS" Match on IP tunnel metadata. Key id .I NUMBER is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel). @@ -346,6 +352,12 @@ the masks is missing, \fBtc\fR assumes a full-length match. The options can be described in the form CLASS:TYPE:DATA/CLASS_MASK:TYPE_MASK:DATA_MASK, where CLASS is represented as a 16bit hexadecimal value, TYPE as an 8bit hexadecimal value and DATA as a variable length hexadecimal value. +vxlan_opts +.I OPTIONS +doesn't support multiple options, and it consists of a key followed by a slash +and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length +match. The option can be described in the form GBP/GBP_MASK, where GBP is +represented as a 32bit number. 
.TP .BI ip_flags " IP_FLAGS" .I IP_FLAGS diff --git a/tc/f_flower.c b/tc/f_flower.c index 9d59d71f..502d2ad0 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -81,6 +81,7 @@ static void explain(void) " enc_tos MASKED-IP_TOS |\n" " enc_ttl MASKED-IP_TTL |\n" " geneve_opts MASKED-OPTIONS |\n" + " vxlan_opts MASKED-OPTIONS |\n" " ip_flags IP-FLAGS | \n" " enc_dst_port [ port_number ] |\n" " ct_state MASKED_CT_STATE |\n" @@ -847,7 +848,7 @@ static int flower_parse_enc_port(char *str, int type, struct nlmsghdr *n) return 0; } -static int flower_parse_geneve_opts(char *str, struct nlmsghdr *n) +static int flower_parse_geneve_opt(char *str, struct nlmsghdr *n) { struct rtattr *nest; char *token; @@ -917,14 +918,33 @@ static int flower_parse_geneve_opts(char *str, struct nlmsghdr *n) return 0; } -static int flower_parse_enc_opt_part(char *str, struct nlmsghdr *n) +static int flower_parse_vxlan_opt(char *str, struct nlmsghdr *n) +{ + struct rtattr *nest; + __u32 gbp; + int err; + + nest = addattr_nest(n, MAX_MSG, + TCA_FLOWER_KEY_ENC_OPTS_VXLAN | NLA_F_NESTED); + + err = get_u32(&gbp, str, 0); + if (err) + return err; + addattr32(n, MAX_MSG, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, gbp); + + addattr_nest_end(n, nest); + + return 0; +} + +static int flower_parse_geneve_opts(char *str, struct nlmsghdr *n) { char *token; int err; token = strsep(&str, ","); while (token) { - err = flower_parse_geneve_opts(token, n); + err = flower_parse_geneve_opt(token, n); if (err) return err; @@ -954,7 +974,7 @@ static int flower_check_enc_opt_key(char *key) return 0; } -static int flower_parse_enc_opts(char *str, struct nlmsghdr *n) +static int flower_parse_enc_opts_geneve(char *str, struct nlmsghdr *n) { char key[XATTR_SIZE_MAX], mask[XATTR_SIZE_MAX]; int data_len, key_len, mask_len, err; @@ -1006,13 +1026,50 @@ static int flower_parse_enc_opts(char *str, struct nlmsghdr *n) mask[mask_len - 1] = '\0'; nest = addattr_nest(n, MAX_MSG, TCA_FLOWER_KEY_ENC_OPTS); - err = flower_parse_enc_opt_part(key, 
n); + err = flower_parse_geneve_opts(key, n); if (err) return err; addattr_nest_end(n, nest); nest = addattr_nest(n, MAX_MSG, TCA_FLOWER_KEY_ENC_OPTS_MASK); - err = flower_parse_enc_opt_part(mask, n); + err = flower_parse_geneve_opts(mask, n); + if (err) + return err; + addattr_nest_end(n, nest); + + return 0; +} + +static int flower_parse_enc_opts_vxlan(char *str, struct nlmsghdr *n) +{ + char key[XATTR_SIZE_MAX], mask[XATTR_SIZE_MAX]; + struct rtattr *nest; + char *slash; + int err; + + slash = strchr(str, '/'); + if (slash) { + *slash++ = '\0'; + if (strlen(slash) > XATTR_SIZE_MAX) + return -1; + strcpy(mask, slash); + } else { + strcpy(mask, "0xffffffff"); + } + + if (strlen(str) > XATTR_SIZE_MAX) + return -1; + strcpy(key, str); + + nest = addattr_nest(n, MAX_MSG, TCA_FLOWER_KEY_ENC_OPTS | NLA_F_NESTED); + err = flower_parse_vxlan_opt(str, n); + if (err) + return err; + addattr_nest_end(n, nest); + + nest = addattr_nest(n, MAX_MSG, + TCA_FLOWER_KEY_ENC_OPTS_MASK | NLA_F_NESTED); + err = flower_parse_vxlan_opt(mask, n); if (err) return err; addattr_nest_end(n, nest); @@ -1502,11 +1559,18 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, } } else if (matches(*argv, "geneve_opts") == 0) { NEXT_ARG(); - ret = flower_parse_enc_opts(*argv, n); + ret = flower_parse_enc_opts_geneve(*argv, n); if (ret < 0) { fprintf(stderr, "Illegal \"geneve_opts\"\n"); return -1; } + } else if (matches(*argv, "vxlan_opts") == 0) { + NEXT_ARG(); + ret = flower_parse_enc_opts_vxlan(*argv, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"vxlan_opts\"\n"); + return -1; + } } else if (matches(*argv, "action") == 0) { NEXT_ARG(); ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n); @@ -1940,10 +2004,29 @@ static void flower_print_geneve_opts(const char *name, struct rtattr *attr, close_json_array(PRINT_JSON, name); } -static void flower_print_geneve_parts(const char *name, struct rtattr *attr, - char *key, char *mask) +static void flower_print_vxlan_opts(const char 
*name, struct rtattr *attr, + char *strbuf) +{ + struct rtattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1]; + struct rtattr *i = RTA_DATA(attr); + int rem = RTA_PAYLOAD(attr); + __u32 gbp; + + parse_rtattr(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, i, rem); + gbp = rta_getattr_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_JSON, "gbp", NULL, gbp); + close_json_object(); + close_json_array(PRINT_JSON, name); + + sprintf(strbuf, "%u", gbp); +} + +static void flower_print_enc_parts(const char *name, const char *namefrm, + struct rtattr *attr, char *key, char *mask) { - char *namefrm = " geneve_opt %s"; char *key_token, *mask_token, *out; int len; @@ -1985,14 +2068,29 @@ static void flower_print_enc_opts(const char *name, struct rtattr *attr, goto err_key_free; parse_rtattr_nested(key_tb, TCA_FLOWER_KEY_ENC_OPTS_MAX, attr); - flower_print_geneve_opts("geneve_opt_key", - key_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE], key); - parse_rtattr_nested(msk_tb, TCA_FLOWER_KEY_ENC_OPTS_MAX, mask_attr); - flower_print_geneve_opts("geneve_opt_mask", - msk_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE], msk); - flower_print_geneve_parts(name, attr, key, msk); + if (key_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE]) { + flower_print_geneve_opts("geneve_opt_key", + key_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE], key); + + if (msk_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE]) + flower_print_geneve_opts("geneve_opt_mask", + msk_tb[TCA_FLOWER_KEY_ENC_OPTS_GENEVE], msk); + + flower_print_enc_parts(name, " geneve_opts %s", attr, key, + msk); + } else if (key_tb[TCA_FLOWER_KEY_ENC_OPTS_VXLAN]) { + flower_print_vxlan_opts("vxlan_opt_key", + key_tb[TCA_FLOWER_KEY_ENC_OPTS_VXLAN], key); + + if (msk_tb[TCA_FLOWER_KEY_ENC_OPTS_VXLAN]) + flower_print_vxlan_opts("vxlan_opt_mask", + msk_tb[TCA_FLOWER_KEY_ENC_OPTS_VXLAN], msk); + + flower_print_enc_parts(name, " vxlan_opts %s", attr, key, + msk); + } free(msk); err_key_free: From 4e578c78fedfe6ffa5fa5fde56778b264485829b Mon 
Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Apr 2020 18:27:51 +0800 Subject: [PATCH 17/33] tc: f_flower: add options support for erspan This patch is to add TCA_FLOWER_KEY_ENC_OPTS_ERSPAN's parse and print to implement erspan options support in f_flower, like Commit 56155d4df86d ("tc: f_flower: add geneve option match support to flower") for geneve options support. Option is expressed as version:index:dir:hwid, dir and hwid will be parsed when version is 2, while index will be parsed when version is 1. erspan doesn't support multiple options. With this patch, users can add and dump erspan options like: # ip link add name erspan1 type erspan external # tc qdisc add dev erspan1 ingress # tc filter add dev erspan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ erspan_opts 1:2:0:0/1:255:0:0 \ ip_proto udp \ action mirred egress redirect dev eth1 # tc -s filter show dev erspan1 parent ffff: filter protocol ip pref 49152 flower chain 0 handle 0x1 eth_type ipv4 ip_proto udp enc_dst_ip 10.0.99.193 enc_src_ip 10.0.99.192 enc_key_id 11 erspan_opts 1:2:0:0/1:255:0:0 not_in_hw action order 1: mirred (Egress Redirect to device eth1) stolen index 1 ref 1 bind 1 Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 v1->v2: - no change. v2->v3: - no change. v3->v4: - keep the same format between input and output, json and non json. - print version, index, dir and hwid as uint.
Signed-off-by: Xin Long Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 13 ++++ tc/f_flower.c | 171 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index 3c7246b8..b3dfcf68 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -85,6 +85,8 @@ flower \- flow based traffic control filter .B geneve_opts | .B vxlan_opts +| +.B erspan_opts } .IR OPTIONS " | " .BR ip_flags @@ -332,6 +334,8 @@ Match the connection zone, and can be masked. .BI geneve_opts " OPTIONS" .TQ .BI vxlan_opts " OPTIONS" +.TQ +.BI erspan_opts " OPTIONS" Match on IP tunnel metadata. Key id .I NUMBER is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel). @@ -358,6 +362,15 @@ doesn't support multiple options, and it consists of a key followed by a slash and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length match. The option can be described in the form GBP/GBP_MASK, where GBP is represented as a 32bit number. +erspan_opts +.I OPTIONS +doesn't support multiple options, and it consists of a key followed by a slash +and corresponding mask. If the mask is missing, \fBtc\fR assumes a full-length +match. The option can be described in the form +VERSION:INDEX:DIR:HWID/VERSION:INDEX_MASK:DIR_MASK:HWID_MASK, where VERSION is +represented as an 8bit number, INDEX as a 32bit number, DIR and HWID as 8bit +numbers. Multiple options are not supported. Note INDEX/INDEX_MASK is used when +VERSION is 1, and DIR/DIR_MASK and HWID/HWID_MASK are used when VERSION is 2.
.TP .BI ip_flags " IP_FLAGS" .I IP_FLAGS diff --git a/tc/f_flower.c b/tc/f_flower.c index 502d2ad0..fc136911 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -82,6 +82,7 @@ static void explain(void) " enc_ttl MASKED-IP_TTL |\n" " geneve_opts MASKED-OPTIONS |\n" " vxlan_opts MASKED-OPTIONS |\n" + " erspan_opts MASKED-OPTIONS |\n" " ip_flags IP-FLAGS | \n" " enc_dst_port [ port_number ] |\n" " ct_state MASKED_CT_STATE |\n" @@ -937,6 +938,84 @@ static int flower_parse_vxlan_opt(char *str, struct nlmsghdr *n) return 0; } +static int flower_parse_erspan_opt(char *str, struct nlmsghdr *n) +{ + struct rtattr *nest; + char *token; + int i, err; + + nest = addattr_nest(n, MAX_MSG, + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN | NLA_F_NESTED); + + i = 1; + token = strsep(&str, ":"); + while (token) { + switch (i) { + case TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + addattr8(n, MAX_MSG, i, opt_type); + break; + } + case TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX: + { + __be32 opt_index; + + if (!strlen(token)) + break; + err = get_be32(&opt_index, token, 0); + if (err) + return err; + + addattr32(n, MAX_MSG, i, opt_index); + break; + } + case TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + addattr8(n, MAX_MSG, i, opt_type); + break; + } + case TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID: + { + __u8 opt_type; + + if (!strlen(token)) + break; + err = get_u8(&opt_type, token, 0); + if (err) + return err; + + addattr8(n, MAX_MSG, i, opt_type); + break; + } + default: + fprintf(stderr, "Unknown \"geneve_opts\" type\n"); + return -1; + } + + token = strsep(&str, ":"); + i++; + } + addattr_nest_end(n, nest); + + return 0; +} + static int flower_parse_geneve_opts(char *str, struct nlmsghdr *n) { char *token; @@ -1077,6 +1156,49 @@ static int flower_parse_enc_opts_vxlan(char *str, struct nlmsghdr 
*n) return 0; } +static int flower_parse_enc_opts_erspan(char *str, struct nlmsghdr *n) +{ + char key[XATTR_SIZE_MAX], mask[XATTR_SIZE_MAX]; + struct rtattr *nest; + char *slash; + int err; + + + slash = strchr(str, '/'); + if (slash) { + *slash++ = '\0'; + if (strlen(slash) > XATTR_SIZE_MAX) + return -1; + strcpy(mask, slash); + } else { + int index; + + slash = strchr(str, ':'); + index = (int)(slash - str); + memcpy(mask, str, index); + strcpy(mask + index, ":0xffffffff:0xff:0xff"); + } + + if (strlen(str) > XATTR_SIZE_MAX) + return -1; + strcpy(key, str); + + nest = addattr_nest(n, MAX_MSG, TCA_FLOWER_KEY_ENC_OPTS | NLA_F_NESTED); + err = flower_parse_erspan_opt(key, n); + if (err) + return err; + addattr_nest_end(n, nest); + + nest = addattr_nest(n, MAX_MSG, + TCA_FLOWER_KEY_ENC_OPTS_MASK | NLA_F_NESTED); + err = flower_parse_erspan_opt(mask, n); + if (err) + return err; + addattr_nest_end(n, nest); + + return 0; +} + static int flower_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) { @@ -1571,6 +1693,13 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, fprintf(stderr, "Illegal \"vxlan_opts\"\n"); return -1; } + } else if (matches(*argv, "erspan_opts") == 0) { + NEXT_ARG(); + ret = flower_parse_enc_opts_erspan(*argv, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"erspan_opts\"\n"); + return -1; + } } else if (matches(*argv, "action") == 0) { NEXT_ARG(); ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n); @@ -2024,6 +2153,38 @@ static void flower_print_vxlan_opts(const char *name, struct rtattr *attr, sprintf(strbuf, "%u", gbp); } +static void flower_print_erspan_opts(const char *name, struct rtattr *attr, + char *strbuf) +{ + struct rtattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1]; + __u8 ver, hwid, dir; + __u32 idx; + + parse_rtattr(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, RTA_DATA(attr), + RTA_PAYLOAD(attr)); + ver = rta_getattr_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]); + if (ver == 1) { + idx 
= rta_getattr_be32(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]); + hwid = 0; + dir = 0; + } else { + idx = 0; + hwid = rta_getattr_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]); + dir = rta_getattr_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]); + } + + open_json_array(PRINT_JSON, name); + open_json_object(NULL); + print_uint(PRINT_JSON, "ver", NULL, ver); + print_uint(PRINT_JSON, "index", NULL, idx); + print_uint(PRINT_JSON, "dir", NULL, dir); + print_uint(PRINT_JSON, "hwid", NULL, hwid); + close_json_object(); + close_json_array(PRINT_JSON, name); + + sprintf(strbuf, "%u:%u:%u:%u", ver, idx, dir, hwid); +} + static void flower_print_enc_parts(const char *name, const char *namefrm, struct rtattr *attr, char *key, char *mask) { @@ -2090,6 +2251,16 @@ static void flower_print_enc_opts(const char *name, struct rtattr *attr, flower_print_enc_parts(name, " vxlan_opts %s", attr, key, msk); + } else if (key_tb[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN]) { + flower_print_erspan_opts("erspan_opt_key", + key_tb[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN], key); + + if (msk_tb[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN]) + flower_print_erspan_opts("erspan_opt_mask", + msk_tb[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN], msk); + + flower_print_enc_parts(name, " erspan_opts %s", attr, key, + msk); } free(msk); From ae57e82da01111f680b4d0543fb1fe3cf9f74571 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 5 May 2020 16:11:22 +0000 Subject: [PATCH 18/33] Update kernel headers Update kernel headers to commit: 354d86141796 ("Merge branch 'net-reduce-dynamic-lockdep-keys'") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 69 +++++++++++++++++++++- include/uapi/linux/genetlink.h | 2 + include/uapi/linux/inet_diag.h | 2 + include/uapi/linux/netlink.h | 103 +++++++++++++++++++++++++++++++++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/pkt_sched.h | 6 ++ 6 files changed, 180 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 60684b7b..518bea7c 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,9 @@ enum bpf_cmd { BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, }; enum bpf_map_type { @@ -220,6 +223,15 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + + MAX_BPF_LINK_TYPE, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. @@ -379,6 +391,12 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -523,6 +541,7 @@ union bpf_attr { __u32 prog_id; __u32 map_id; __u32 btf_id; + __u32 link_id; }; __u32 next_id; __u32 open_flags; @@ -589,6 +608,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -652,6 +675,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -1562,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. 
The *level* at @@ -1570,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1764,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1773,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * @@ -3025,6 +3060,14 @@ union bpf_attr { * * **-EOPNOTSUPP** Unsupported operation, for example a * call from outside of TC ingress. * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3151,7 +3194,8 @@ union bpf_attr { FN(xdp_output), \ FN(get_netns_cookie), \ FN(get_current_ancestor_cgroup_id), \ - FN(sk_assign), + FN(sk_assign), \ + FN(ktime_get_boot_ns), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3598,6 +3642,25 @@ struct bpf_btf_info { __u32 id; } __attribute__((aligned(8))); +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + }; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 1317119c..7c6c390c 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -48,6 +48,7 @@ enum { CTRL_CMD_NEWMCAST_GRP, CTRL_CMD_DELMCAST_GRP, CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, __CTRL_CMD_MAX, }; @@ -62,6 +63,7 @@ enum { CTRL_ATTR_MAXATTR, CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, __CTRL_ATTR_MAX, }; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 0c1c781c..f009abf1 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -96,6 +96,7 @@ enum { INET_DIAG_BC_MARK_COND, INET_DIAG_BC_S_EQ, INET_DIAG_BC_D_EQ, + INET_DIAG_BC_CGROUP_COND, /* u64 cgroup v2 ID */ }; struct inet_diag_hostcond { @@ -157,6 +158,7 @@ enum { INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, + INET_DIAG_CGROUP_ID, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 
2c28d329..695c88e3 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -245,4 +245,107 @@ struct nla_bitfield32 { __u32 selector; }; +/* + * policy descriptions - it's specific to each family how this is used + * Normally, it should be retrieved via a dump inside another attribute + * specifying where it applies. + */ + +/** + * enum netlink_attribute_type - type of an attribute + * @NL_ATTR_TYPE_INVALID: unused + * @NL_ATTR_TYPE_FLAG: flag attribute (present/not present) + * @NL_ATTR_TYPE_U8: 8-bit unsigned attribute + * @NL_ATTR_TYPE_U16: 16-bit unsigned attribute + * @NL_ATTR_TYPE_U32: 32-bit unsigned attribute + * @NL_ATTR_TYPE_U64: 64-bit unsigned attribute + * @NL_ATTR_TYPE_S8: 8-bit signed attribute + * @NL_ATTR_TYPE_S16: 16-bit signed attribute + * @NL_ATTR_TYPE_S32: 32-bit signed attribute + * @NL_ATTR_TYPE_S64: 64-bit signed attribute + * @NL_ATTR_TYPE_BINARY: binary data, min/max length may be specified + * @NL_ATTR_TYPE_STRING: string, min/max length may be specified + * @NL_ATTR_TYPE_NUL_STRING: NUL-terminated string, + * min/max length may be specified + * @NL_ATTR_TYPE_NESTED: nested, i.e. the content of this attribute + * consists of sub-attributes. The nested policy and maxtype + * inside may be specified. + * @NL_ATTR_TYPE_NESTED_ARRAY: nested array, i.e. the content of this + * attribute contains sub-attributes whose type is irrelevant + * (just used to separate the array entries) and each such array + * entry has attributes again, the policy for those inner ones + * and the corresponding maxtype may be specified. 
+ * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + */ +enum netlink_attribute_type { + NL_ATTR_TYPE_INVALID, + + NL_ATTR_TYPE_FLAG, + + NL_ATTR_TYPE_U8, + NL_ATTR_TYPE_U16, + NL_ATTR_TYPE_U32, + NL_ATTR_TYPE_U64, + + NL_ATTR_TYPE_S8, + NL_ATTR_TYPE_S16, + NL_ATTR_TYPE_S32, + NL_ATTR_TYPE_S64, + + NL_ATTR_TYPE_BINARY, + NL_ATTR_TYPE_STRING, + NL_ATTR_TYPE_NUL_STRING, + + NL_ATTR_TYPE_NESTED, + NL_ATTR_TYPE_NESTED_ARRAY, + + NL_ATTR_TYPE_BITFIELD32, +}; + +/** + * enum netlink_policy_type_attr - policy type attributes + * @NL_POLICY_TYPE_ATTR_UNSPEC: unused + * @NL_POLICY_TYPE_ATTR_TYPE: type of the attribute, + * &enum netlink_attribute_type (U32) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_S: minimum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_S: maximum value for signed + * integers (S64) + * @NL_POLICY_TYPE_ATTR_MIN_VALUE_U: minimum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MAX_VALUE_U: maximum value for unsigned + * integers (U64) + * @NL_POLICY_TYPE_ATTR_MIN_LENGTH: minimum length for binary + * attributes, no minimum if not given (U32) + * @NL_POLICY_TYPE_ATTR_MAX_LENGTH: maximum length for binary + * attributes, no maximum if not given (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_IDX: sub policy for nested and + * nested array types (U32) + * @NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE: maximum sub policy + * attribute for nested and nested array types, this can + * in theory be < the size of the policy pointed to by + * the index, if limited inside the nesting (U32) + * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the + * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + */ +enum netlink_policy_type_attr { + NL_POLICY_TYPE_ATTR_UNSPEC, + NL_POLICY_TYPE_ATTR_TYPE, + NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + NL_POLICY_TYPE_ATTR_MIN_LENGTH, + NL_POLICY_TYPE_ATTR_MAX_LENGTH, + 
NL_POLICY_TYPE_ATTR_POLICY_IDX, + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + NL_POLICY_TYPE_ATTR_PAD, + + /* keep last */ + __NL_POLICY_TYPE_ATTR_MAX, + NL_POLICY_TYPE_ATTR_MAX = __NL_POLICY_TYPE_ATTR_MAX - 1 +}; + #endif /* __LINUX_NETLINK_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9f06d29c..fc672b23 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -134,6 +134,7 @@ enum tca_id { TCA_ID_CTINFO, TCA_ID_MPLS, TCA_ID_CT, + TCA_ID_GATE, /* other actions go here */ __TCA_ID_MAX = 255 }; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 0c02737c..a95f3ae7 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -913,6 +913,10 @@ enum { TCA_FQ_TIMER_SLACK, /* timer slack */ + TCA_FQ_HORIZON, /* time horizon in us */ + + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + __TCA_FQ_MAX }; @@ -932,6 +936,8 @@ struct tc_fq_qd_stats { __u32 throttled_flows; __u32 unthrottle_latency_ns; __u64 ce_mark; /* packets above ce_threshold */ + __u64 horizon_drops; + __u64 horizon_caps; }; /* Heavy-Hitter Filter */ From 3175bca7182b0867c6e9a3d5d1551fdecf70118c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 30 Apr 2020 20:03:30 +0200 Subject: [PATCH 19/33] tc: full JSON support for 'bpf' filter example using eBPF: # tc filter add dev dummy0 ingress bpf \ > direct-action obj ./bpf/filter.o sec tc-ingress # tc -j filter show dev dummy0 ingress | jq [ { "protocol": "all", "pref": 49152, "kind": "bpf", "chain": 0 }, { "protocol": "all", "pref": 49152, "kind": "bpf", "chain": 0, "options": { "handle": "0x1", "bpf_name": "filter.o:[tc-ingress]", "direct-action": true, "not_in_hw": true, "prog": { "id": 101, "tag": "a04f5eef06a7f555", "jited": 1 } } } ] v2: - use print_nl(), thanks to Andrea Claudi - use print_0xhex() for filter handle, thanks to Stephen Hemminger Signed-off-by: Davide Caratti 
Acked-by: Andrea Claudi Signed-off-by: David Ahern --- tc/f_bpf.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tc/f_bpf.c b/tc/f_bpf.c index 135271aa..fa3552ae 100644 --- a/tc/f_bpf.c +++ b/tc/f_bpf.c @@ -203,22 +203,24 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, parse_rtattr_nested(tb, TCA_BPF_MAX, opt); if (handle) - fprintf(f, "handle 0x%x ", handle); + print_0xhex(PRINT_ANY, "handle", "handle %#llx ", handle); if (tb[TCA_BPF_CLASSID]) { SPRINT_BUF(b1); - fprintf(f, "flowid %s ", + print_string(PRINT_ANY, "flowid", "flowid %s ", sprint_tc_classid(rta_getattr_u32(tb[TCA_BPF_CLASSID]), b1)); } if (tb[TCA_BPF_NAME]) - fprintf(f, "%s ", rta_getattr_str(tb[TCA_BPF_NAME])); + print_string(PRINT_ANY, "bpf_name", "%s ", + rta_getattr_str(tb[TCA_BPF_NAME])); if (tb[TCA_BPF_FLAGS]) { unsigned int flags = rta_getattr_u32(tb[TCA_BPF_FLAGS]); if (flags & TCA_BPF_FLAG_ACT_DIRECT) - fprintf(f, "direct-action "); + print_bool(PRINT_ANY, + "direct-action", "direct-action ", true); } if (tb[TCA_BPF_FLAGS_GEN]) { @@ -226,14 +228,14 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, rta_getattr_u32(tb[TCA_BPF_FLAGS_GEN]); if (flags & TCA_CLS_FLAGS_SKIP_HW) - fprintf(f, "skip_hw "); + print_bool(PRINT_ANY, "skip_hw", "skip_hw ", true); if (flags & TCA_CLS_FLAGS_SKIP_SW) - fprintf(f, "skip_sw "); - + print_bool(PRINT_ANY, "skip_sw", "skip_sw ", true); if (flags & TCA_CLS_FLAGS_IN_HW) - fprintf(f, "in_hw "); + print_bool(PRINT_ANY, "in_hw", "in_hw ", true); else if (flags & TCA_CLS_FLAGS_NOT_IN_HW) - fprintf(f, "not_in_hw "); + print_bool(PRINT_ANY, + "not_in_hw", "not_in_hw ", true); } if (tb[TCA_BPF_OPS] && tb[TCA_BPF_OPS_LEN]) @@ -245,14 +247,13 @@ static int bpf_print_opt(struct filter_util *qu, FILE *f, if (!dump_ok && tb[TCA_BPF_TAG]) { SPRINT_BUF(b); - fprintf(f, "tag %s ", - hexstring_n2a(RTA_DATA(tb[TCA_BPF_TAG]), - RTA_PAYLOAD(tb[TCA_BPF_TAG]), - b, sizeof(b))); + print_string(PRINT_ANY, "tag", 
"tag %s ", + hexstring_n2a(RTA_DATA(tb[TCA_BPF_TAG]), + RTA_PAYLOAD(tb[TCA_BPF_TAG]), b, sizeof(b))); } if (tb[TCA_BPF_POLICE]) { - fprintf(f, "\n"); + print_nl(); tc_print_police(f, tb[TCA_BPF_POLICE]); } From c1b21f528687f913535621a898b94bbc083b94f7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 5 May 2020 16:23:14 +0000 Subject: [PATCH 20/33] Import rpl.h and rpl_iptunnel.h uapi headers Import rpl.h and rpl_iptunnel.h as of kernel commit: 354d86141796 ("Merge branch 'net-reduce-dynamic-lockdep-keys'") Signed-off-by: David Ahern --- include/uapi/linux/rpl.h | 48 +++++++++++++++++++++++++++++++ include/uapi/linux/rpl_iptunnel.h | 21 ++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 include/uapi/linux/rpl.h create mode 100644 include/uapi/linux/rpl_iptunnel.h diff --git a/include/uapi/linux/rpl.h b/include/uapi/linux/rpl.h new file mode 100644 index 00000000..c24b64cd --- /dev/null +++ b/include/uapi/linux/rpl.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 RPL-SR implementation + * + * Author: + * (C) 2020 Alexander Aring + */ + +#ifndef _LINUX_RPL_H +#define _LINUX_RPL_H + +#include +#include +#include + +/* + * RPL SR Header + */ +struct ipv6_rpl_sr_hdr { + __u8 nexthdr; + __u8 hdrlen; + __u8 type; + __u8 segments_left; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 cmpre:4, + cmpri:4, + reserved:4, + pad:4, + reserved1:16; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 reserved:20, + pad:4, + cmpri:4, + cmpre:4; +#else +#error "Please fix " +#endif + + union { + struct in6_addr addr[0]; + __u8 data[0]; + } segments; +} __attribute__((packed)); + +#define rpl_segaddr segments.addr +#define rpl_segdata segments.data + +#endif diff --git a/include/uapi/linux/rpl_iptunnel.h b/include/uapi/linux/rpl_iptunnel.h new file mode 100644 index 00000000..c255b92c --- /dev/null +++ b/include/uapi/linux/rpl_iptunnel.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + 
* IPv6 RPL-SR implementation + * + * Author: + * (C) 2020 Alexander Aring + */ + +#ifndef _LINUX_RPL_IPTUNNEL_H +#define _LINUX_RPL_IPTUNNEL_H + +enum { + RPL_IPTUNNEL_UNSPEC, + RPL_IPTUNNEL_SRH, + __RPL_IPTUNNEL_MAX, +}; +#define RPL_IPTUNNEL_MAX (__RPL_IPTUNNEL_MAX - 1) + +#define RPL_IPTUNNEL_SRH_SIZE(srh) (((srh)->hdrlen + 1) << 3) + +#endif From ec04b6fc241b183e550b41831ecd1ff5d720f510 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 May 2020 10:04:57 -0700 Subject: [PATCH 21/33] devlink: support kernel-side snapshot id allocation Make ID argument optional and read the snapshot info that kernel sends us. $ devlink region new netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: snapshot 0 $ devlink -jp region new netdevsim/netdevsim1/dummy { "regions": { "netdevsim/netdevsim1/dummy": { "snapshot": [ 1 ] } } } $ devlink region show netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: size 32768 snapshot [0 1] Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index bd48a73b..507972c3 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -6476,6 +6476,23 @@ static int cmd_region_read(struct dl *dl) return err; } +static int cmd_region_snapshot_new_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + struct dl *dl = data; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] || + !tb[DEVLINK_ATTR_REGION_NAME] || + !tb[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + return MNL_CB_ERROR; + + pr_out_region(dl, tb); + + return MNL_CB_OK; +} + static int cmd_region_snapshot_new(struct dl *dl) { struct nlmsghdr *nlh; @@ -6484,12 +6501,15 @@ static int cmd_region_snapshot_new(struct dl *dl) nlh = mnlg_msg_prepare(dl->nlg, 
DEVLINK_CMD_REGION_NEW, NLM_F_REQUEST | NLM_F_ACK); - err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE_REGION | - DL_OPT_REGION_SNAPSHOT_ID, 0); + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE_REGION, + DL_OPT_REGION_SNAPSHOT_ID); if (err) return err; - return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); + pr_out_section_start(dl, "regions"); + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_region_snapshot_new_cb, dl); + pr_out_section_end(dl); + return err; } static void cmd_region_help(void) From 0e9b227e2d9ea0874c2438c733e6d1e9de925563 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 May 2020 02:18:15 +0000 Subject: [PATCH 22/33] Update kernel headers and import tc_gate.h Update kernel headers to commit: fb9f2e92864f ("net: dsa: tag_sja1105: appease sparse checks for ethertype accessors") and import tc_act/tc_gate.h Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/tc_act/tc_gate.h | 47 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/tc_act/tc_gate.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 518bea7c..dc5314dd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/include/uapi/linux/tc_act/tc_gate.h b/include/uapi/linux/tc_act/tc_gate.h new file mode 100644 index 00000000..f214b3a6 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_gate.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright 2020 NXP */ + +#ifndef __LINUX_TC_GATE_H +#define __LINUX_TC_GATE_H + +#include + +struct tc_gate { + tc_gen; +}; + +enum { + TCA_GATE_ENTRY_UNSPEC, + TCA_GATE_ENTRY_INDEX, + TCA_GATE_ENTRY_GATE, + 
TCA_GATE_ENTRY_INTERVAL, + TCA_GATE_ENTRY_IPV, + TCA_GATE_ENTRY_MAX_OCTETS, + __TCA_GATE_ENTRY_MAX, +}; +#define TCA_GATE_ENTRY_MAX (__TCA_GATE_ENTRY_MAX - 1) + +enum { + TCA_GATE_ONE_ENTRY_UNSPEC, + TCA_GATE_ONE_ENTRY, + __TCA_GATE_ONE_ENTRY_MAX, +}; +#define TCA_GATE_ONE_ENTRY_MAX (__TCA_GATE_ONE_ENTRY_MAX - 1) + +enum { + TCA_GATE_UNSPEC, + TCA_GATE_TM, + TCA_GATE_PARMS, + TCA_GATE_PAD, + TCA_GATE_PRIORITY, + TCA_GATE_ENTRY_LIST, + TCA_GATE_BASE_TIME, + TCA_GATE_CYCLE_TIME, + TCA_GATE_CYCLE_TIME_EXT, + TCA_GATE_FLAGS, + TCA_GATE_CLOCKID, + __TCA_GATE_MAX, +}; +#define TCA_GATE_MAX (__TCA_GATE_MAX - 1) + +#endif From 07d5ee70b5b3d1777ef6569bc5a94bf5d0ea5c0a Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 8 May 2020 15:02:46 +0800 Subject: [PATCH 23/33] iproute2-next:tc:action: add a gate control action Introduce a ingress frame gate control flow action. Tc gate action does the work like this: Assume there is a gate allow specified ingress frames can pass at specific time slot, and also drop at specific time slot. Tc filter chooses the ingress frames, and tc gate action would specify what slot does these frames can be passed to device and what time slot would be dropped. Tc gate action would provide an entry list to tell how much time gate keep open and how much time gate keep state close. Gate action also assign a start time to tell when the entry list start. Then driver would repeat the gate entry list cyclically. For the software simulation, gate action require the user assign a time clock type. Below is the setting example in user space. Tc filter a stream source ip address is 192.168.0.20 and gate action own two time slots. One is last 200ms gate open let frame pass another is last 100ms gate close let frames dropped. 
# tc qdisc add dev eth0 ingress # tc filter add dev eth0 parent ffff: protocol ip \ flower src_ip 192.168.0.20 \ action gate index 2 clockid CLOCK_TAI \ sched-entry open 200000000ns -1 8000000b \ sched-entry close 100000000ns # tc chain del dev eth0 ingress chain 0 "sched-entry" follows the taprio naming style. The gate state is "open" or "close", followed by the interval in nanoseconds. The next value, -1, is the internal priority value, which selects the ingress queue the frames are put into; "-1" means wildcard. The optional last value specifies the maximum number of MSDU octets that are permitted to pass the gate during the specified time interval; frames over the limit are dropped. The example below filters a stream whose destination mac address is 10:00:80:00:00:00 and whose ip type is ICMP, followed by the gate action. The gate action runs with a single close time slot, which means the gate is always kept closed. The total cycle time is 200000000ns. The base-time is calculated as: 1357000000000 + (N + 1) * cycletime The first such value that lies in the future becomes the start time. The cycletime here is 200000000ns.
#tc filter add dev eth0 parent ffff: protocol ip \ flower skip_hw ip_proto icmp dst_mac 10:00:80:00:00:00 \ action gate index 12 base-time 1357000000000ns \ sched-entry CLOSE 200000000ns \ clockid CLOCK_TAI Signed-off-by: Po Liu Signed-off-by: David Ahern --- tc/Makefile | 1 + tc/m_gate.c | 580 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 581 insertions(+) create mode 100644 tc/m_gate.c diff --git a/tc/Makefile b/tc/Makefile index e31cbc12..79c9c1dd 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -54,6 +54,7 @@ TCMODULES += m_bpf.o TCMODULES += m_tunnel_key.o TCMODULES += m_sample.o TCMODULES += m_ct.o +TCMODULES += m_gate.o TCMODULES += p_ip.o TCMODULES += p_ip6.o TCMODULES += p_icmp.o diff --git a/tc/m_gate.c b/tc/m_gate.c new file mode 100644 index 00000000..327df7eb --- /dev/null +++ b/tc/m_gate.c @@ -0,0 +1,580 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* Copyright 2020 NXP */ + +#include +#include +#include +#include +#include +#include "utils.h" +#include "rt_names.h" +#include "tc_util.h" +#include "list.h" +#include + +struct gate_entry { + struct list_head list; + uint8_t gate_state; + uint32_t interval; + int32_t ipv; + int32_t maxoctets; +}; + +#define CLOCKID_INVALID (-1) +static const struct clockid_table { + const char *name; + clockid_t clockid; +} clockt_map[] = { + { "REALTIME", CLOCK_REALTIME }, + { "TAI", CLOCK_TAI }, + { "BOOTTIME", CLOCK_BOOTTIME }, + { "MONOTONIC", CLOCK_MONOTONIC }, + { NULL } +}; + +static void explain(void) +{ + fprintf(stderr, + "Usage: gate [ priority PRIO-SPEC ] [ base-time BASE-TIME ]\n" + " [ cycle-time CYCLE-TIME ]\n" + " [ cycle-time-ext CYCLE-TIME-EXT ]\n" + " [ clockid CLOCKID ] [flags FLAGS]\n" + " [ sched-entry GATE0 INTERVAL [ INTERNAL-PRIO-VALUE MAX-OCTETS ] ]\n" + " [ sched-entry GATE1 INTERVAL [ INTERNAL-PRIO-VALUE MAX-OCTETS ] ]\n" + " ......\n" + " [ sched-entry GATEn INTERVAL [ INTERNAL-PRIO-VALUE MAX-OCTETS ] ]\n" + " [ CONTROL ]\n" + " GATEn := open | close\n" + 
" INTERVAL : nanoseconds period of gate slot\n" + " INTERNAL-PRIO-VALUE : internal priority decide which\n" + " rx queue number direct to.\n" + " default to be -1 which means wildcard.\n" + " MAX-OCTETS : maximum number of MSDU octets that are\n" + " permitted to pas the gate during the\n" + " specified TimeInterval.\n" + " default to be -1 which means wildcard.\n" + " CONTROL := pipe | drop | continue | pass |\n" + " goto chain \n"); +} + +static void usage(void) +{ + explain(); + exit(-1); +} + +static void explain_entry_format(void) +{ + fprintf(stderr, "Usage: sched-entry [ ]\n"); +} + +static int parse_gate(struct action_util *a, int *argc_p, char ***argv_p, + int tca_id, struct nlmsghdr *n); +static int print_gate(struct action_util *au, FILE *f, struct rtattr *arg); + +struct action_util gate_action_util = { + .id = "gate", + .parse_aopt = parse_gate, + .print_aopt = print_gate, +}; + +static int get_clockid(__s32 *val, const char *arg) +{ + const struct clockid_table *c; + + if (strcasestr(arg, "CLOCK_") != NULL) + arg += sizeof("CLOCK_") - 1; + + for (c = clockt_map; c->name; c++) { + if (strcasecmp(c->name, arg) == 0) { + *val = c->clockid; + return 0; + } + } + + return -1; +} + +static const char *get_clock_name(clockid_t clockid) +{ + const struct clockid_table *c; + + for (c = clockt_map; c->name; c++) { + if (clockid == c->clockid) + return c->name; + } + + return "invalid"; +} + +static int get_gate_state(__u8 *val, const char *arg) +{ + if (!strcasecmp("OPEN", arg)) { + *val = 1; + return 0; + } + + if (!strcasecmp("CLOSE", arg)) { + *val = 0; + return 0; + } + + return -1; +} + +static struct gate_entry *create_gate_entry(uint8_t gate_state, + uint32_t interval, + int32_t ipv, + int32_t maxoctets) +{ + struct gate_entry *e; + + e = calloc(1, sizeof(*e)); + if (!e) + return NULL; + + e->gate_state = gate_state; + e->interval = interval; + e->ipv = ipv; + e->maxoctets = maxoctets; + + return e; +} + +static int add_gate_list(struct list_head 
*gate_entries, struct nlmsghdr *n) +{ + struct gate_entry *e; + + list_for_each_entry(e, gate_entries, list) { + struct rtattr *a; + + a = addattr_nest(n, 1024, TCA_GATE_ONE_ENTRY | NLA_F_NESTED); + + if (e->gate_state) + addattr(n, MAX_MSG, TCA_GATE_ENTRY_GATE); + + addattr_l(n, MAX_MSG, TCA_GATE_ENTRY_INTERVAL, + &e->interval, sizeof(e->interval)); + addattr_l(n, MAX_MSG, TCA_GATE_ENTRY_IPV, + &e->ipv, sizeof(e->ipv)); + addattr_l(n, MAX_MSG, TCA_GATE_ENTRY_MAX_OCTETS, + &e->maxoctets, sizeof(e->maxoctets)); + + addattr_nest_end(n, a); + } + + return 0; +} + +static void free_entries(struct list_head *gate_entries) +{ + struct gate_entry *e, *n; + + list_for_each_entry_safe(e, n, gate_entries, list) { + list_del(&e->list); + free(e); + } +} + +static int parse_gate(struct action_util *a, int *argc_p, char ***argv_p, + int tca_id, struct nlmsghdr *n) +{ + struct tc_gate parm = {.action = TC_ACT_PIPE}; + struct list_head gate_entries; + __s32 clockid = CLOCKID_INVALID; + struct rtattr *tail, *nle; + char **argv = *argv_p; + int argc = *argc_p; + __s64 base_time = 0; + __s64 cycle_time = 0; + __s64 cycle_time_ext = 0; + int entry_num = 0; + char *invalidarg; + __u32 flags = 0; + int prio = -1; + + int err; + + if (matches(*argv, "gate") != 0) + return -1; + + NEXT_ARG(); + if (argc <= 0) + return -1; + + INIT_LIST_HEAD(&gate_entries); + + while (argc > 0) { + if (matches(*argv, "index") == 0) { + NEXT_ARG(); + if (get_u32(&parm.index, *argv, 10)) { + invalidarg = "index"; + goto err_arg; + } + } else if (matches(*argv, "priority") == 0) { + NEXT_ARG(); + if (get_s32(&prio, *argv, 0)) { + invalidarg = "priority"; + goto err_arg; + } + } else if (matches(*argv, "base-time") == 0) { + NEXT_ARG(); + if (get_s64(&base_time, *argv, 10) && + get_time64(&base_time, *argv)) { + invalidarg = "base-time"; + goto err_arg; + } + } else if (matches(*argv, "cycle-time") == 0) { + NEXT_ARG(); + if (get_s64(&cycle_time, *argv, 10) && + get_time64(&cycle_time, *argv)) { + invalidarg 
= "cycle-time"; + goto err_arg; + } + } else if (matches(*argv, "cycle-time-ext") == 0) { + NEXT_ARG(); + if (get_s64(&cycle_time_ext, *argv, 10) && + get_time64(&cycle_time_ext, *argv)) { + invalidarg = "cycle-time-ext"; + goto err_arg; + } + } else if (matches(*argv, "clockid") == 0) { + NEXT_ARG(); + if (get_clockid(&clockid, *argv)) { + invalidarg = "clockid"; + goto err_arg; + } + } else if (matches(*argv, "flags") == 0) { + NEXT_ARG(); + if (get_u32(&flags, *argv, 0)) { + invalidarg = "flags"; + goto err_arg; + } + } else if (matches(*argv, "sched-entry") == 0) { + unsigned int maxoctets_uint = 0; + int32_t maxoctets = -1; + struct gate_entry *e; + uint8_t gate_state = 0; + __s64 interval_s64 = 0; + uint32_t interval = 0; + int32_t ipv = -1; + + if (!NEXT_ARG_OK()) { + explain_entry_format(); + fprintf(stderr, "\"sched-entry\" is imcomplete\n"); + free_entries(&gate_entries); + return -1; + } + + NEXT_ARG(); + + if (get_gate_state(&gate_state, *argv)) { + explain_entry_format(); + fprintf(stderr, "\"sched-entry\" is imcomplete\n"); + free_entries(&gate_entries); + return -1; + } + + if (!NEXT_ARG_OK()) { + explain_entry_format(); + fprintf(stderr, "\"sched-entry\" is imcomplete\n"); + free_entries(&gate_entries); + return -1; + } + + NEXT_ARG(); + + if (get_u32(&interval, *argv, 0) && + get_time64(&interval_s64, *argv)) { + explain_entry_format(); + fprintf(stderr, "\"sched-entry\" is imcomplete\n"); + free_entries(&gate_entries); + return -1; + } + + if (interval_s64 > UINT_MAX) { + fprintf(stderr, "\"interval\" is too large\n"); + free_entries(&gate_entries); + return -1; + } else if (interval_s64) { + interval = interval_s64; + } + + if (!NEXT_ARG_OK()) + goto create_entry; + + NEXT_ARG(); + + if (get_s32(&ipv, *argv, 0)) { + PREV_ARG(); + goto create_entry; + } + + if (!gate_state) + ipv = -1; + + if (!NEXT_ARG_OK()) + goto create_entry; + + NEXT_ARG(); + + if (get_s32(&maxoctets, *argv, 0) && + get_size(&maxoctets_uint, *argv)) + PREV_ARG(); + + if 
(maxoctets_uint > INT_MAX) { + fprintf(stderr, "\"maxoctets\" is too large\n"); + free_entries(&gate_entries); + return -1; + } else if (maxoctets_uint ) { + maxoctets = maxoctets_uint; + } + + if (!gate_state) + maxoctets = -1; + +create_entry: + e = create_gate_entry(gate_state, interval, + ipv, maxoctets); + if (!e) { + fprintf(stderr, "gate: not enough memory\n"); + free_entries(&gate_entries); + return -1; + } + + list_add_tail(&e->list, &gate_entries); + entry_num++; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + break; + } + + argc--; + argv++; + } + + parse_action_control_dflt(&argc, &argv, &parm.action, + false, TC_ACT_PIPE); + + if (!entry_num && !parm.index) { + fprintf(stderr, "gate: must add at least one entry\n"); + return -1; + } + + tail = addattr_nest(n, MAX_MSG, tca_id | NLA_F_NESTED); + addattr_l(n, MAX_MSG, TCA_GATE_PARMS, &parm, sizeof(parm)); + + if (prio != -1) + addattr_l(n, MAX_MSG, TCA_GATE_PRIORITY, &prio, sizeof(prio)); + + if (flags) + addattr_l(n, MAX_MSG, TCA_GATE_FLAGS, &flags, sizeof(flags)); + + if (base_time) + addattr_l(n, MAX_MSG, TCA_GATE_BASE_TIME, + &base_time, sizeof(base_time)); + + if (cycle_time) + addattr_l(n, MAX_MSG, TCA_GATE_CYCLE_TIME, + &cycle_time, sizeof(cycle_time)); + + if (cycle_time_ext) + addattr_l(n, MAX_MSG, TCA_GATE_CYCLE_TIME_EXT, + &cycle_time_ext, sizeof(cycle_time_ext)); + + if (clockid != CLOCKID_INVALID) + addattr_l(n, MAX_MSG, TCA_GATE_CLOCKID, + &clockid, sizeof(clockid)); + + nle = addattr_nest(n, MAX_MSG, TCA_GATE_ENTRY_LIST | NLA_F_NESTED); + err = add_gate_list(&gate_entries, n); + if (err < 0) { + fprintf(stderr, "Could not add entries to netlink message\n"); + free_entries(&gate_entries); + return -1; + } + + addattr_nest_end(n, nle); + addattr_nest_end(n, tail); + free_entries(&gate_entries); + *argc_p = argc; + *argv_p = argv; + + return 0; +err_arg: + invarg(invalidarg, *argv); + free_entries(&gate_entries); + + return -1; +} + +static int print_gate_list(struct rtattr 
*list) +{ + struct rtattr *item; + int rem; + + rem = RTA_PAYLOAD(list); + + print_string(PRINT_FP, NULL, "%s", _SL_); + print_string(PRINT_FP, NULL, "\tschedule:%s", _SL_); + open_json_array(PRINT_JSON, "schedule"); + + for (item = RTA_DATA(list); + RTA_OK(item, rem); + item = RTA_NEXT(item, rem)) { + struct rtattr *tb[TCA_GATE_ENTRY_MAX + 1]; + __u32 index = 0, interval = 0; + __u8 gate_state = 0; + __s32 ipv = -1, maxoctets = -1; + char buf[22]; + + parse_rtattr_nested(tb, TCA_GATE_ENTRY_MAX, item); + + if (tb[TCA_GATE_ENTRY_INDEX]) + index = rta_getattr_u32(tb[TCA_GATE_ENTRY_INDEX]); + + if (tb[TCA_GATE_ENTRY_GATE]) + gate_state = 1; + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = rta_getattr_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (tb[TCA_GATE_ENTRY_IPV]) + ipv = rta_getattr_s32(tb[TCA_GATE_ENTRY_IPV]); + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + maxoctets = rta_getattr_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + + open_json_object(NULL); + print_uint(PRINT_ANY, "number", "\t number %4u", index); + print_string(PRINT_ANY, "gate_state", "\tgate-state %s ", + gate_state ? 
"open" : "close"); + + print_uint(PRINT_JSON, "interval", NULL, interval); + + memset(buf, 0, sizeof(buf)); + print_string(PRINT_FP, NULL, "\tinterval %s", + sprint_time64(interval, buf)); + + if (ipv != -1) { + print_uint(PRINT_ANY, "ipv", "\t ipv %-10u", ipv); + } else { + print_int(PRINT_JSON, "ipv", NULL, ipv); + print_string(PRINT_FP, NULL, "\t ipv %s", "wildcard"); + } + + if (maxoctets != -1) { + memset(buf, 0, sizeof(buf)); + print_uint(PRINT_JSON, "max_octets", NULL, maxoctets); + print_string(PRINT_FP, NULL, "\t max-octets %s", + sprint_size(maxoctets, buf)); + } else { + print_string(PRINT_FP, NULL, + "\t max-octets %s", "wildcard"); + print_int(PRINT_JSON, "max_octets", NULL, maxoctets); + } + + close_json_object(); + print_string(PRINT_FP, NULL, "%s", _SL_); + } + + close_json_array(PRINT_ANY, ""); + + return 0; +} + +static int print_gate(struct action_util *au, FILE *f, struct rtattr *arg) +{ + struct tc_gate *parm; + struct rtattr *tb[TCA_GATE_MAX + 1]; + __s32 clockid = CLOCKID_INVALID; + __s64 base_time = 0; + __s64 cycle_time = 0; + __s64 cycle_time_ext = 0; + char buf[22]; + int prio = -1; + + if (arg == NULL) + return -1; + + parse_rtattr_nested(tb, TCA_GATE_MAX, arg); + + if (!tb[TCA_GATE_PARMS]) { + fprintf(stderr, "Missing gate parameters\n"); + return -1; + } + + print_string(PRINT_FP, NULL, "%s", "\n"); + + parm = RTA_DATA(tb[TCA_GATE_PARMS]); + + if (tb[TCA_GATE_PRIORITY]) + prio = rta_getattr_s32(tb[TCA_GATE_PRIORITY]); + + if (prio != -1) { + print_int(PRINT_ANY, "priority", "\tpriority %-8d", prio); + } else { + print_string(PRINT_FP, NULL, "\tpriority %s", "wildcard"); + print_int(PRINT_JSON, "priority", NULL, prio); + } + + if (tb[TCA_GATE_CLOCKID]) + clockid = rta_getattr_s32(tb[TCA_GATE_CLOCKID]); + print_string(PRINT_ANY, "clockid", "\tclockid %s", + get_clock_name(clockid)); + + if (tb[TCA_GATE_FLAGS]) { + __u32 flags; + + flags = rta_getattr_u32(tb[TCA_GATE_FLAGS]); + print_0xhex(PRINT_ANY, "flags", "\tflags %#x", flags); + } + 
+ print_string(PRINT_FP, NULL, "%s", "\n"); + + if (tb[TCA_GATE_BASE_TIME]) + base_time = rta_getattr_s64(tb[TCA_GATE_BASE_TIME]); + + memset(buf, 0, sizeof(buf)); + print_string(PRINT_FP, NULL, "\tbase-time %s", + sprint_time64(base_time, buf)); + print_lluint(PRINT_JSON, "base_time", NULL, base_time); + + if (tb[TCA_GATE_CYCLE_TIME]) + cycle_time = rta_getattr_s64(tb[TCA_GATE_CYCLE_TIME]); + + memset(buf, 0, sizeof(buf)); + print_string(PRINT_FP, NULL, + "\tcycle-time %s", sprint_time64(cycle_time, buf)); + print_lluint(PRINT_JSON, "cycle_time", NULL, cycle_time); + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + cycle_time_ext = rta_getattr_s64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + memset(buf, 0, sizeof(buf)); + print_string(PRINT_FP, NULL, "\tcycle-time-ext %s", + sprint_time64(cycle_time_ext, buf)); + print_lluint(PRINT_JSON, "cycle_time_ext", NULL, cycle_time_ext); + + if (tb[TCA_GATE_ENTRY_LIST]) + print_gate_list(tb[TCA_GATE_ENTRY_LIST]); + + print_action_control(f, "\t", parm->action, ""); + + print_uint(PRINT_ANY, "index", "\n\t index %u", parm->index); + print_int(PRINT_ANY, "ref", " ref %d", parm->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", parm->bindcnt); + + if (show_stats) { + if (tb[TCA_GATE_TM]) { + struct tcf_t *tm = RTA_DATA(tb[TCA_GATE_TM]); + + print_tm(f, tm); + } + } + + print_string(PRINT_FP, NULL, "%s", "\n"); + + return 0; +} From 965a5f6a1b394d6ab791be76550e650cad985ef0 Mon Sep 17 00:00:00 2001 From: Po Liu Date: Fri, 8 May 2020 15:02:47 +0800 Subject: [PATCH 24/33] iproute2-next: add gate action man page This patch is to add the man page for the tc gate action. 
Signed-off-by: Po Liu Signed-off-by: David Ahern --- man/man8/tc-gate.8 | 123 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 man/man8/tc-gate.8 diff --git a/man/man8/tc-gate.8 b/man/man8/tc-gate.8 new file mode 100644 index 00000000..23d93ca4 --- /dev/null +++ b/man/man8/tc-gate.8 @@ -0,0 +1,123 @@ +.TH GATE 8 "12 Mar 2020" "iproute2" "Linux" +.SH NAME +gate \- Stream Gate Action +.SH SYNOPSIS +.B tc " ... " action gate +.ti +8 +.B [ base-time +BASETIME ] +.B [ clockid +CLOCKID ] +.ti +8 +.B sched-entry +<gate state> <interval 1> [ <internal priority> <max octets> ] +.ti +8 +.B sched-entry +<gate state> <interval 2> [ <internal priority> <max octets> ] +.ti +8 +.B sched-entry +<gate state> <interval 3> [ <internal priority> <max octets> ] +.ti +8 +.B ...... +.ti +8 +.B sched-entry +<gate state> <interval N> [ <internal priority> <max octets> ] + +.SH DESCRIPTION +GATE action allows specified ingress frames can be passed at +specific time slot, or be dropped at specific time slot. Tc filter +filters the ingress frames, then tc gate action would specify which time +slot and how many bytes these frames can be passed to device and +which time slot frames would be dropped. +Gate action also assign a base-time to tell when the entry list start. +Then gate action would start to repeat the gate entry list cyclically +at the start base-time. +For the software simulation, gate action requires the user assign reference +time clock type. + +.SH PARAMETERS + +.TP +base-time +.br +Specifies the instant in nanoseconds, defining the time when the schedule +starts. If 'base-time' is a time in the past, the schedule will start at + +base-time + (N * cycle-time) + +where N is the smallest integer so the resulting time is greater than +"now", and "cycle-time" is the sum of all the intervals of the entries +in the schedule. Without base-time specified, will default to be 0. + +.TP +clockid +.br +Specifies the clock to be used by qdisc's internal timer for measuring +time and scheduling events. Not valid if gate action is used for offloading +filter. +For example, tc filter command with +.B skip_sw +parameter.
+ +.TP +sched-entry +.br +There may multiple +.B sched-entry +parameters in a single schedule. Each one has the format: + +sched-entry <gate state> <interval> [ <internal priority> <max octets> ] + +.br +<gate state> means gate states. 'open' keep gate open, 'close' keep gate close. +.br +<interval> means how much nano seconds for this time slot. +.br +<internal priority> means internal priority value. Present of the +internal receiving queue for this stream. "-1" means wildcard. +<internal priority> and <max octets> can be omit default to be "-1" which both + value to be "-1" for this <sched-entry>. +.br +<max octets> means how many octets size could pass in this time slot. Dropped +if overlimited. "-1" means wildcard. <max octets> can be omit default to be +"-1" which value to be "-1" for this <sched-entry>. +.br +Note that <internal priority> and <max octets> are nothing meaning for gate state +is "close" in a "sched-entry". All frames are dropped when "sched-entry" with +"close" state. + +.SH EXAMPLES + +The following example shows tc filter frames source ip match to the +192.168.0.20 will keep the gate open for 200ms and limit the traffic to 8MB +in this sched-entry. Then keep the traffic gate to be close for 100ms. +Frames arrived at gate close state would be dropped. Then the cycle would +run the gate entries periodically. The schedule will start at instant 200.0s +using the reference CLOCK_TAI. The schedule is composed of two entries +each of 300ms duration. + +.EX +# tc qdisc add dev eth0 ingress +# tc filter add dev eth0 parent ffff: protocol ip \\ + flower skip_hw src_ip 192.168.0.20 \\ + action gate index 2 clockid CLOCK_TAI \\ + base-time 200000000000ns \\ + sched-entry open 200000000ns -1 8000000b \\ + sched-entry close 100000000ns + +.EE + +Following commands is an example to filter a stream source mac match to the +10:00:80:00:00:00 icmp frames will be dropped at any time with cycle 200ms. +With a default basetime 0 and clockid is CLOCK_TAI as default.
+ +.EX +# tc qdisc add dev eth0 ingress +# tc filter add dev eth0 parent ffff: protocol ip \\ + flower ip_proto icmp dst_mac 10:00:80:00:00:00 \\ + action gate index 12 sched-entry close 200000000ns + +.EE + +.SH AUTHORS +Po Liu From d5e6ee0dac64b64ee0f4a8ed3badf8858ec79edc Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 9 May 2020 19:52:00 +0300 Subject: [PATCH 25/33] ss: introduce cgroup2 cache and helper functions This patch prepares infrastructure for matching sockets by cgroups. Two helper functions are added for transformation between cgroup v2 ID and pathname. Cgroup v2 cache is implemented as hash table indexed by ID. This cache is needed for faster lookups of socket cgroup. v2: - style fixes (David Ahern) Signed-off-by: Dmitry Yakunin Signed-off-by: David Ahern --- include/cg_map.h | 6 +++ include/utils.h | 4 +- ip/ipvrf.c | 4 +- lib/Makefile | 2 +- lib/cg_map.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++ lib/fs.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 282 insertions(+), 6 deletions(-) create mode 100644 include/cg_map.h create mode 100644 lib/cg_map.c diff --git a/include/cg_map.h b/include/cg_map.h new file mode 100644 index 00000000..d30517fd --- /dev/null +++ b/include/cg_map.h @@ -0,0 +1,6 @@ +#ifndef __CG_MAP_H__ +#define __CG_MAP_H__ + +const char *cg_id_to_path(__u64 id); + +#endif /* __CG_MAP_H__ */ diff --git a/include/utils.h b/include/utils.h index 001491a1..7041c461 100644 --- a/include/utils.h +++ b/include/utils.h @@ -302,7 +302,9 @@ int get_real_family(int rtm_type, int rtm_family); int cmd_exec(const char *cmd, char **argv, bool do_fork, int (*setup)(void *), void *arg); int make_path(const char *path, mode_t mode); -char *find_cgroup2_mount(void); +char *find_cgroup2_mount(bool do_mount); +__u64 get_cgroup2_id(const char *path); +char *get_cgroup2_path(__u64 id, bool full); int get_command_name(const char *pid, char *comm, size_t len); int get_rtnl_link_stats_rta(struct rtnl_link_stats64 
*stats64, diff --git a/ip/ipvrf.c b/ip/ipvrf.c index b9a43675..28dd8e25 100644 --- a/ip/ipvrf.c +++ b/ip/ipvrf.c @@ -225,7 +225,7 @@ static int ipvrf_pids(int argc, char **argv) return -1; } - mnt = find_cgroup2_mount(); + mnt = find_cgroup2_mount(true); if (!mnt) return -1; @@ -366,7 +366,7 @@ static int vrf_switch(const char *name) } } - mnt = find_cgroup2_mount(); + mnt = find_cgroup2_mount(true); if (!mnt) return -1; diff --git a/lib/Makefile b/lib/Makefile index bab8cbf5..7cba1857 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,7 +5,7 @@ CFLAGS += -fPIC UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \ inet_proto.o namespace.o json_writer.o json_print.o \ - names.o color.o bpf.o exec.o fs.o + names.o color.o bpf.o exec.o fs.o cg_map.o NLOBJ=libgenl.o libnetlink.o diff --git a/lib/cg_map.c b/lib/cg_map.c new file mode 100644 index 00000000..77f030e3 --- /dev/null +++ b/lib/cg_map.c @@ -0,0 +1,135 @@ +/* + * cg_map.c cgroup v2 cache + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Dmitry Yakunin + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <stdbool.h> +#include <linux/types.h> +#include <linux/limits.h> +#include <ftw.h> + +#include "cg_map.h" +#include "list.h" +#include "utils.h" + +struct cg_cache { + struct hlist_node id_hash; + __u64 id; + char path[]; +}; + +#define IDMAP_SIZE 1024 +static struct hlist_head id_head[IDMAP_SIZE]; + +static struct cg_cache *cg_get_by_id(__u64 id) +{ + unsigned int h = id & (IDMAP_SIZE - 1); + struct hlist_node *n; + + hlist_for_each(n, &id_head[h]) { + struct cg_cache *cg; + + cg = container_of(n, struct cg_cache, id_hash); + if (cg->id == id) + return cg; + } + + return NULL; +} + +static struct cg_cache *cg_entry_create(__u64 id, const char *path) +{ + unsigned int h = id & (IDMAP_SIZE - 1); + struct cg_cache *cg; + + cg = malloc(sizeof(*cg) + strlen(path) + 1); + if (!cg) { + fprintf(stderr, + "Failed to allocate memory for cgroup2 cache entry\n"); + return NULL; + } + cg->id = id; + strcpy(cg->path, path); + + hlist_add_head(&cg->id_hash, &id_head[h]); + + return cg; +} + +static int mntlen; + +static int nftw_fn(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftw) +{ + const char *path; + __u64 id; + + if (typeflag != FTW_D) + return 0; + + id = get_cgroup2_id(fpath); + if (!id) + return -1; + + path = fpath + mntlen; + if (*path == '\0') + /* root cgroup */ + path = "/"; + if (!cg_entry_create(id, path)) + return -1; + + return 0; +} + +static void cg_init_map(void) +{ + char *mnt; + + mnt = find_cgroup2_mount(false); + if (!mnt) + exit(1); + + mntlen = strlen(mnt); + if (nftw(mnt, nftw_fn, 1024, FTW_MOUNT) < 0) + exit(1); + + free(mnt); +} + +const char *cg_id_to_path(__u64 id) +{ + static int initialized; + static char buf[64]; + + const struct cg_cache *cg; + char *path; + + if (!initialized) { + cg_init_map(); + initialized = 1; + } + + cg = cg_get_by_id(id); + if (cg) + return cg->path; + + path = get_cgroup2_path(id, false); + if (path) { + cg = cg_entry_create(id, path); + free(path); + if (cg) + return
cg->path; + } + + snprintf(buf, sizeof(buf), "unreachable:%llx", id); + return buf; +} diff --git a/lib/fs.c b/lib/fs.c index 86efd4ed..e265fc04 100644 --- a/lib/fs.c +++ b/lib/fs.c @@ -59,13 +59,18 @@ static char *find_fs_mount(const char *fs_to_find) } /* caller needs to free string returned */ -char *find_cgroup2_mount(void) +char *find_cgroup2_mount(bool do_mount) { char *mnt = find_fs_mount(CGROUP2_FS_NAME); if (mnt) return mnt; + if (!do_mount) { + fprintf(stderr, "Failed to find cgroup2 mount\n"); + return NULL; + } + mnt = strdup(MNT_CGRP2_PATH); if (!mnt) { fprintf(stderr, "Failed to allocate memory for cgroup2 path\n"); @@ -74,7 +79,7 @@ char *find_cgroup2_mount(void) } if (make_path(mnt, 0755)) { - fprintf(stderr, "Failed to setup vrf cgroup2 directory\n"); + fprintf(stderr, "Failed to setup cgroup2 directory\n"); free(mnt); return NULL; } @@ -99,6 +104,134 @@ out: return mnt; } +__u64 get_cgroup2_id(const char *path) +{ + char fh_buf[sizeof(struct file_handle) + sizeof(__u64)] = { 0 }; + struct file_handle *fhp = (struct file_handle *)fh_buf; + union { + __u64 id; + unsigned char bytes[sizeof(__u64)]; + } cg_id = { .id = 0 }; + char *mnt = NULL; + int mnt_fd = -1; + int mnt_id; + + if (!path) { + fprintf(stderr, "Invalid cgroup2 path\n"); + return 0; + } + + fhp->handle_bytes = sizeof(__u64); + if (name_to_handle_at(AT_FDCWD, path, fhp, &mnt_id, 0) < 0) { + /* try at cgroup2 mount */ + + while (*path == '/') + path++; + if (*path == '\0') { + fprintf(stderr, "Invalid cgroup2 path\n"); + goto out; + } + + mnt = find_cgroup2_mount(false); + if (!mnt) + goto out; + + mnt_fd = open(mnt, O_RDONLY); + if (mnt_fd < 0) { + fprintf(stderr, "Failed to open cgroup2 mount\n"); + goto out; + } + + fhp->handle_bytes = sizeof(__u64); + if (name_to_handle_at(mnt_fd, path, fhp, &mnt_id, 0) < 0) { + fprintf(stderr, "Failed to get cgroup2 ID: %s\n", + strerror(errno)); + goto out; + } + if (fhp->handle_bytes != sizeof(__u64)) { + fprintf(stderr, "Invalid size of cgroup2 
ID\n"); + goto out; + } + } + + memcpy(cg_id.bytes, fhp->f_handle, sizeof(__u64)); + +out: + close(mnt_fd); + free(mnt); + + return cg_id.id; +} + +#define FILEID_INO32_GEN 1 + +/* caller needs to free string returned */ +char *get_cgroup2_path(__u64 id, bool full) +{ + char fh_buf[sizeof(struct file_handle) + sizeof(__u64)] = { 0 }; + struct file_handle *fhp = (struct file_handle *)fh_buf; + union { + __u64 id; + unsigned char bytes[sizeof(__u64)]; + } cg_id = { .id = id }; + int mnt_fd = -1, fd = -1; + char link_buf[PATH_MAX]; + char *path = NULL; + char fd_path[64]; + int link_len; + char *mnt; + + if (!id) { + fprintf(stderr, "Invalid cgroup2 ID\n"); + return NULL; + } + + mnt = find_cgroup2_mount(false); + if (!mnt) + return NULL; + + mnt_fd = open(mnt, O_RDONLY); + if (mnt_fd < 0) { + fprintf(stderr, "Failed to open cgroup2 mount\n"); + goto out; + } + + fhp->handle_bytes = sizeof(__u64); + fhp->handle_type = FILEID_INO32_GEN; + memcpy(fhp->f_handle, cg_id.bytes, sizeof(__u64)); + + fd = open_by_handle_at(mnt_fd, fhp, 0); + if (fd < 0) { + fprintf(stderr, "Failed to open cgroup2 by ID\n"); + goto out; + } + + snprintf(fd_path, sizeof(fd_path), "/proc/self/fd/%d", fd); + link_len = readlink(fd_path, link_buf, sizeof(link_buf) - 1); + if (link_len < 0) { + fprintf(stderr, + "Failed to read value of symbolic link %s\n", + fd_path); + goto out; + } + link_buf[link_len] = '\0'; + + if (full) + path = strdup(link_buf); + else + path = strdup(link_buf + strlen(mnt)); + if (!path) + fprintf(stderr, + "Failed to allocate memory for cgroup2 path\n"); + +out: + close(fd); + close(mnt_fd); + free(mnt); + + return path; +} + int make_path(const char *path, mode_t mode) { char *dir, *delim; From 14f4bda590447179e2b338c6e494658d2f737e6e Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 9 May 2020 19:52:01 +0300 Subject: [PATCH 26/33] ss: add support for cgroup v2 information and filtering This patch introduces two new features: obtaining cgroup information and 
filtering sockets by cgroups. These features work based on cgroup v2 ID field in the socket (kernel should be compiled with CONFIG_SOCK_CGROUP_DATA). Cgroup information can be obtained by specifying --cgroup flag and now contains only pathname. For faster pathname lookups cgroup cache is implemented. This cache is filled on ss startup and missed entries are resolved and saved on the fly. Cgroup filter extends EXPRESSION and allows to specify cgroup pathname (relative or absolute) to obtain sockets attached only to this cgroup. Filter syntax: ss [ cgroup PATHNAME ] Examples: ss -a cgroup /sys/fs/cgroup/unified (or ss -a cgroup .) ss -a cgroup /sys/fs/cgroup/unified/cgroup1 (or ss -a cgroup cgroup1) v2: - style fixes (David Ahern) Signed-off-by: Dmitry Yakunin Signed-off-by: David Ahern --- man/man8/ss.8 | 9 ++++++++ misc/ss.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ misc/ssfilter.h | 2 ++ misc/ssfilter.y | 22 +++++++++++++++++- 4 files changed, 93 insertions(+), 1 deletion(-) diff --git a/man/man8/ss.8 b/man/man8/ss.8 index c80853f9..3b2559ff 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -286,6 +286,15 @@ Class id set by net_cls cgroup. If class is zero this shows priority set by SO_PRIORITY. .RE .TP +.B \-\-cgroup +Show cgroup information. Below fields may appear: +.RS +.P +.TP +.B cgroup +Cgroup v2 pathname. This pathname is relative to the mount point of the hierarchy. +.RE +.TP .B \-K, \-\-kill Attempts to forcibly close sockets. 
This option displays sockets that are successfully closed and silently skips sockets that the kernel does not support diff --git a/misc/ss.c b/misc/ss.c index ee840149..2a71317d 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -36,6 +36,7 @@ #include "namespace.h" #include "SNAPSHOT.h" #include "rt_names.h" +#include "cg_map.h" #include #include @@ -123,6 +124,7 @@ static int follow_events; static int sctp_ino; static int show_tipcinfo; static int show_tos; +static int show_cgroup; int oneline; enum col_id { @@ -798,6 +800,7 @@ struct sockstat { char *name; char *peer_name; __u32 mark; + __u64 cgroup_id; }; struct dctcpstat { @@ -1418,6 +1421,9 @@ static void sock_details_print(struct sockstat *s) if (s->mark) out(" fwmark:0x%x", s->mark); + + if (s->cgroup_id) + out(" cgroup:%s", cg_id_to_path(s->cgroup_id)); } static void sock_addr_print(const char *addr, char *delim, const char *port, @@ -1644,6 +1650,7 @@ struct aafilter { unsigned int iface; __u32 mark; __u32 mask; + __u64 cgroup_id; struct aafilter *next; }; @@ -1771,6 +1778,12 @@ static int run_ssfilter(struct ssfilter *f, struct sockstat *s) struct aafilter *a = (void *)f->pred; return (s->mark & a->mask) == a->mark; + } + case SSF_CGROUPCOND: + { + struct aafilter *a = (void *)f->pred; + + return s->cgroup_id == a->cgroup_id; } /* Yup. It is recursion. Sorry. 
*/ case SSF_AND: @@ -1962,6 +1975,23 @@ static int ssfilter_bytecompile(struct ssfilter *f, char **bytecode) { a->mark, a->mask}, }; + return inslen; + } + case SSF_CGROUPCOND: + { + struct aafilter *a = (void *)f->pred; + struct instr { + struct inet_diag_bc_op op; + __u64 cgroup_id; + } __attribute__((packed)); + int inslen = sizeof(struct instr); + + if (!(*bytecode = malloc(inslen))) abort(); + ((struct instr *)*bytecode)[0] = (struct instr) { + { INET_DIAG_BC_CGROUP_COND, inslen, inslen + 4 }, + a->cgroup_id, + }; + return inslen; } default: @@ -2301,6 +2331,22 @@ void *parse_markmask(const char *markmask) return res; } +void *parse_cgroupcond(const char *path) +{ + struct aafilter *res; + __u64 id; + + id = get_cgroup2_id(path); + if (!id) + return NULL; + + res = malloc(sizeof(*res)); + if (res) + res->cgroup_id = id; + + return res; +} + static void proc_ctx_print(struct sockstat *s) { char *buf; @@ -3157,6 +3203,9 @@ static void parse_diag_msg(struct nlmsghdr *nlh, struct sockstat *s) s->mark = 0; if (tb[INET_DIAG_MARK]) s->mark = rta_getattr_u32(tb[INET_DIAG_MARK]); + s->cgroup_id = 0; + if (tb[INET_DIAG_CGROUP_ID]) + s->cgroup_id = rta_getattr_u64(tb[INET_DIAG_CGROUP_ID]); if (tb[INET_DIAG_PROTOCOL]) s->raw_prot = rta_getattr_u8(tb[INET_DIAG_PROTOCOL]); else @@ -3224,6 +3273,11 @@ static int inet_show_sock(struct nlmsghdr *nlh, out(" class_id:%#x", rta_getattr_u32(tb[INET_DIAG_CLASS_ID])); } + if (show_cgroup) { + if (tb[INET_DIAG_CGROUP_ID]) + out(" cgroup:%s", cg_id_to_path(rta_getattr_u64(tb[INET_DIAG_CGROUP_ID]))); + } + if (show_mem || (show_tcpinfo && s->type != IPPROTO_UDP)) { if (!oneline) out("\n\t"); @@ -5049,6 +5103,7 @@ static void _usage(FILE *dest) " --tipcinfo show internal tipc socket information\n" " -s, --summary show socket usage summary\n" " --tos show tos and priority information\n" +" --cgroup show cgroup information\n" " -b, --bpf show bpf filter socket information\n" " -E, --events continually display sockets as they are 
destroyed\n" " -Z, --context display process SELinux security contexts\n" @@ -5159,6 +5214,8 @@ static int scan_state(const char *state) /* Values of 'x' are already used so a non-character is used */ #define OPT_XDPSOCK 260 +#define OPT_CGROUP 261 + static const struct option long_opts[] = { { "numeric", 0, 0, 'n' }, { "resolve", 0, 0, 'r' }, @@ -5195,6 +5252,7 @@ static const struct option long_opts[] = { { "net", 1, 0, 'N' }, { "tipcinfo", 0, 0, OPT_TIPCINFO}, { "tos", 0, 0, OPT_TOS }, + { "cgroup", 0, 0, OPT_CGROUP }, { "kill", 0, 0, 'K' }, { "no-header", 0, 0, 'H' }, { "xdp", 0, 0, OPT_XDPSOCK}, @@ -5382,6 +5440,9 @@ int main(int argc, char *argv[]) case OPT_TOS: show_tos = 1; break; + case OPT_CGROUP: + show_cgroup = 1; + break; case 'K': current_filter.kill = 1; break; diff --git a/misc/ssfilter.h b/misc/ssfilter.h index f5b0bc8a..d85c084e 100644 --- a/misc/ssfilter.h +++ b/misc/ssfilter.h @@ -11,6 +11,7 @@ #define SSF_S_AUTO 9 #define SSF_DEVCOND 10 #define SSF_MARKMASK 11 +#define SSF_CGROUPCOND 12 #include @@ -25,3 +26,4 @@ int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp); void *parse_hostcond(char *addr, bool is_port); void *parse_devcond(char *name); void *parse_markmask(const char *markmask); +void *parse_cgroupcond(const char *path); diff --git a/misc/ssfilter.y b/misc/ssfilter.y index a901ae75..b4175795 100644 --- a/misc/ssfilter.y +++ b/misc/ssfilter.y @@ -36,7 +36,7 @@ static void yyerror(char *s) %} -%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND DEVCOND DEVNAME MARKMASK FWMARK +%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND DEVCOND DEVNAME MARKMASK FWMARK CGROUPCOND CGROUPPATH %left '|' %left '&' %nonassoc '!' 
@@ -156,6 +156,14 @@ expr: '(' exprlist ')' { $$ = alloc_node(SSF_NOT, alloc_node(SSF_MARKMASK, $3)); } + | CGROUPPATH eq CGROUPCOND + { + $$ = alloc_node(SSF_CGROUPCOND, $3); + } + | CGROUPPATH NEQ CGROUPCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_CGROUPCOND, $3)); + } | AUTOBOUND { $$ = alloc_node(SSF_S_AUTO, NULL); @@ -276,6 +284,10 @@ int yylex(void) tok_type = FWMARK; return FWMARK; } + if (strcmp(curtok, "cgroup") == 0) { + tok_type = CGROUPPATH; + return CGROUPPATH; + } if (strcmp(curtok, ">=") == 0 || strcmp(curtok, "ge") == 0 || strcmp(curtok, "geq") == 0) @@ -318,6 +330,14 @@ int yylex(void) } return MARKMASK; } + if (tok_type == CGROUPPATH) { + yylval = (void*)parse_cgroupcond(curtok); + if (yylval == NULL) { + fprintf(stderr, "Cannot parse cgroup %s.\n", curtok); + exit(1); + } + return CGROUPCOND; + } yylval = (void*)parse_hostcond(curtok, tok_type == SPORT || tok_type == DPORT); if (yylval == NULL) { fprintf(stderr, "Cannot parse dst/src address.\n"); From 7bd9188581aa8abd644d45a990529a92efe50255 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 9 May 2020 19:52:02 +0300 Subject: [PATCH 27/33] ss: add checks for bc filter support As noted by David Ahern, now if some bytecode filter is not supported by running kernel printed error message is not clear. This patch is attempt to detect such case and print correct message. This is done by providing checking function for new filter types. As example check function for cgroup filter is implemented. It sends correct lightweight request (idiag_states = 0) with zero cgroup condition to the kernel and checks returned errno. If filter is not supported EINVAL is returned. Result of checking is cached to avoid extra checks if several same filters are specified. 
Signed-off-by: Dmitry Yakunin Signed-off-by: David Ahern --- misc/Makefile | 2 +- misc/ss.c | 17 +------ misc/ss_util.h | 22 +++++++++ misc/ssfilter.h | 34 ++++++++------ misc/ssfilter.y | 9 +++- misc/ssfilter_check.c | 103 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 154 insertions(+), 33 deletions(-) create mode 100644 misc/ss_util.h create mode 100644 misc/ssfilter_check.c diff --git a/misc/Makefile b/misc/Makefile index 1debfb15..50dae79c 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -SSOBJ=ss.o ssfilter.tab.o +SSOBJ=ss.o ssfilter_check.o ssfilter.tab.o LNSTATOBJ=lnstat.o lnstat_util.o TARGETS=ss nstat ifstat rtacct lnstat diff --git a/misc/ss.c b/misc/ss.c index 2a71317d..71224218 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -29,6 +29,7 @@ #include #include +#include "ss_util.h" #include "utils.h" #include "rt_names.h" #include "ll_map.h" @@ -39,8 +40,6 @@ #include "cg_map.h" #include -#include -#include #include #include /* for MAX_ADDR_LEN */ #include @@ -64,24 +63,10 @@ #define AF_VSOCK PF_VSOCK #endif -#define MAGIC_SEQ 123456 #define BUF_CHUNK (1024 * 1024) /* Buffer chunk allocation size */ #define BUF_CHUNKS_MAX 5 /* Maximum number of allocated buffer chunks */ #define LEN_ALIGN(x) (((x) + 1) & ~1) -#define DIAG_REQUEST(_req, _r) \ - struct { \ - struct nlmsghdr nlh; \ - _r; \ - } _req = { \ - .nlh = { \ - .nlmsg_type = SOCK_DIAG_BY_FAMILY, \ - .nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST,\ - .nlmsg_seq = MAGIC_SEQ, \ - .nlmsg_len = sizeof(_req), \ - }, \ - } - #if HAVE_SELINUX #include #else diff --git a/misc/ss_util.h b/misc/ss_util.h new file mode 100644 index 00000000..f7e40bb9 --- /dev/null +++ b/misc/ss_util.h @@ -0,0 +1,22 @@ +#ifndef __SS_UTIL_H__ +#define __SS_UTIL_H__ + +#include +#include + +#define MAGIC_SEQ 123456 + +#define DIAG_REQUEST(_req, _r) \ + struct { \ + struct nlmsghdr nlh; \ + _r; \ + } _req = { \ + .nlh = { \ + .nlmsg_type = SOCK_DIAG_BY_FAMILY, \ + 
.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST,\ + .nlmsg_seq = MAGIC_SEQ, \ + .nlmsg_len = sizeof(_req), \ + }, \ + } + +#endif /* __SS_UTIL_H__ */ diff --git a/misc/ssfilter.h b/misc/ssfilter.h index d85c084e..0be3b1e0 100644 --- a/misc/ssfilter.h +++ b/misc/ssfilter.h @@ -1,20 +1,24 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#define SSF_DCOND 0 -#define SSF_SCOND 1 -#define SSF_OR 2 -#define SSF_AND 3 -#define SSF_NOT 4 -#define SSF_D_GE 5 -#define SSF_D_LE 6 -#define SSF_S_GE 7 -#define SSF_S_LE 8 -#define SSF_S_AUTO 9 -#define SSF_DEVCOND 10 -#define SSF_MARKMASK 11 -#define SSF_CGROUPCOND 12 - #include +enum { + SSF_DCOND, + SSF_SCOND, + SSF_OR, + SSF_AND, + SSF_NOT, + SSF_D_GE, + SSF_D_LE, + SSF_S_GE, + SSF_S_LE, + SSF_S_AUTO, + SSF_DEVCOND, + SSF_MARKMASK, + SSF_CGROUPCOND, + SSF__MAX +}; + +bool ssfilter_is_supported(int type); + struct ssfilter { int type; diff --git a/misc/ssfilter.y b/misc/ssfilter.y index b4175795..8e16b446 100644 --- a/misc/ssfilter.y +++ b/misc/ssfilter.y @@ -12,7 +12,14 @@ typedef struct ssfilter * ssfilter_t; static struct ssfilter * alloc_node(int type, void *pred) { - struct ssfilter *n = malloc(sizeof(*n)); + struct ssfilter *n; + + if (!ssfilter_is_supported(type)) { + fprintf(stderr, "It looks like such filter is not supported! 
Too old kernel?\n"); + exit(-1); + } + + n = malloc(sizeof(*n)); if (n == NULL) abort(); n->type = type; diff --git a/misc/ssfilter_check.c b/misc/ssfilter_check.c new file mode 100644 index 00000000..38c960c1 --- /dev/null +++ b/misc/ssfilter_check.c @@ -0,0 +1,103 @@ +#include +#include +#include + +#include "libnetlink.h" +#include "ssfilter.h" +#include "ss_util.h" + +static int dummy_filter(struct nlmsghdr *n, void *arg) +{ + /* just stops rtnl_dump_filter() */ + return -1; +} + +static bool cgroup_filter_check(void) +{ + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + DIAG_REQUEST(req, struct inet_diag_req_v2 r); + struct instr { + struct inet_diag_bc_op op; + __u64 cgroup_id; + } __attribute__((packed)); + int inslen = sizeof(struct instr); + struct instr instr = { + { INET_DIAG_BC_CGROUP_COND, inslen, inslen + 4 }, + 0 + }; + struct rtnl_handle rth; + struct iovec iov[3]; + struct msghdr msg; + struct rtattr rta; + int ret = false; + int iovlen = 3; + + if (rtnl_open_byproto(&rth, 0, NETLINK_SOCK_DIAG)) + return false; + rth.dump = MAGIC_SEQ; + rth.flags = RTNL_HANDLE_F_SUPPRESS_NLERR; + + memset(&req.r, 0, sizeof(req.r)); + req.r.sdiag_family = AF_INET; + req.r.sdiag_protocol = IPPROTO_TCP; + req.nlh.nlmsg_len += RTA_LENGTH(inslen); + + rta.rta_type = INET_DIAG_REQ_BYTECODE; + rta.rta_len = RTA_LENGTH(inslen); + + iov[0] = (struct iovec) { &req, sizeof(req) }; + iov[1] = (struct iovec) { &rta, sizeof(rta) }; + iov[2] = (struct iovec) { &instr, inslen }; + + msg = (struct msghdr) { + .msg_name = (void *)&nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = iov, + .msg_iovlen = iovlen, + }; + + if (sendmsg(rth.fd, &msg, 0) < 0) + goto out; + + if (rtnl_dump_filter(&rth, dummy_filter, NULL) < 0) { + ret = (errno != EINVAL); + goto out; + } + + ret = true; + +out: + rtnl_close(&rth); + + return ret; +} + + +struct filter_check_t { + bool (*check)(void); + int checked:1, + supported:1; +}; + +static struct filter_check_t filter_checks[SSF__MAX] = { 
+ [SSF_CGROUPCOND] = { cgroup_filter_check, 0 }, +}; + +bool ssfilter_is_supported(int type) +{ + struct filter_check_t *f; /* points into filter_checks[] so the probe result is cached */ + + if (type < 0 || type >= SSF__MAX) + return false; + + f = &filter_checks[type]; + if (!f->check) + return true; /* no probe registered: filter type is always available */ + + if (!f->checked) { + f->supported = f->check(); + f->checked = 1; + } + + return f->supported; +} From 42796dcd36772f2a0116bf8a1e94fe43733c0716 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 13 May 2020 21:47:17 +0200 Subject: [PATCH 28/33] tc: mqprio: reject queues count/offset pair count higher than num_tc Provide a sanity check that will make sure whether queues count/offset pair count will not exceed the actual number of TCs being created. Example command that is invalid because there are 4 count/offset pairs whereas num_tc is only 2. # tc qdisc add dev enp96s0f0 root mqprio num_tc 2 map 0 0 0 0 1 1 1 1 queues 4@0 4@4 4@8 4@12 hw 1 mode channel Store the parsed count/offset pair count onto a dedicated variable that will be compared against opt.num_tc after all of the command line arguments were parsed. Bail out if this count is higher than opt.num_tc and let user know about it. Drivers were swallowing such commands as they were iterating over count/offset pairs where num_tc was used as a delimiter, so this is not a big deal, but better catch such misconfiguration at the command line argument parsing level.
Signed-off-by: Maciej Fijalkowski Signed-off-by: David Ahern --- tc/q_mqprio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tc/q_mqprio.c b/tc/q_mqprio.c index 0eb41308..f26ba8d7 100644 --- a/tc/q_mqprio.c +++ b/tc/q_mqprio.c @@ -48,6 +48,7 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc, __u64 max_rate64[TC_QOPT_MAX_QUEUE] = {0}; __u16 shaper = TC_MQPRIO_SHAPER_DCB; __u16 mode = TC_MQPRIO_MODE_DCB; + int cnt_off_pairs = 0; struct rtattr *tail; __u32 flags = 0; @@ -94,6 +95,7 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc, } free(tmp); idx++; + cnt_off_pairs++; } } else if (strcmp(*argv, "hw") == 0) { NEXT_ARG(); @@ -173,6 +175,12 @@ static int mqprio_parse_opt(struct qdisc_util *qu, int argc, argc--; argv++; } + if (cnt_off_pairs > opt.num_tc) { + fprintf(stderr, "queues count/offset pair count %d can not be higher than given num_tc %d\n", + cnt_off_pairs, opt.num_tc); + return -1; + } + tail = NLMSG_TAIL(n); addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); From 9f91f1b7b81dcfc13614ac792602f57a4094cafa Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 May 2020 19:57:27 -0400 Subject: [PATCH 29/33] lwtunnel: add support for rpl segment routing This patch adds support for rpl segment routing settings. 
Example: ip -n ns0 -6 route add 2001::3 encap rpl segs \ fe80::c8fe:beef:cafe:cafe,fe80::c8fe:beef:cafe:beef dev lowpan0 Signed-off-by: Alexander Aring Signed-off-by: David Ahern --- ip/iproute.c | 2 +- ip/iproute_lwtunnel.c | 121 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/ip/iproute.c b/ip/iproute.c index 07c45169..05ec2c29 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -101,7 +101,7 @@ static void usage(void) "TIME := NUMBER[s|ms]\n" "BOOL := [1|0]\n" "FEATURES := ecn\n" - "ENCAPTYPE := [ mpls | ip | ip6 | seg6 | seg6local ]\n" + "ENCAPTYPE := [ mpls | ip | ip6 | seg6 | seg6local | rpl ]\n" "ENCAPHDR := [ MPLSLABEL | SEG6HDR ]\n" "SEG6HDR := [ mode SEGMODE ] segs ADDR1,ADDRi,ADDRn [hmac HMACKEYID] [cleanup]\n" "SEGMODE := [ encap | inline ]\n" diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index ff7c9d7f..9b4f0885 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -29,6 +29,8 @@ #include #include +#include +#include #include #include #include @@ -50,6 +52,8 @@ static const char *format_encap_type(int type) return "seg6"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "seg6local"; + case LWTUNNEL_ENCAP_RPL: + return "rpl"; default: return "unknown"; } @@ -84,6 +88,8 @@ static int read_encap_type(const char *name) return LWTUNNEL_ENCAP_SEG6; else if (strcmp(name, "seg6local") == 0) return LWTUNNEL_ENCAP_SEG6_LOCAL; + else if (strcmp(name, "rpl") == 0) + return LWTUNNEL_ENCAP_RPL; else if (strcmp(name, "help") == 0) encap_type_usage(); @@ -162,6 +168,42 @@ static void print_encap_seg6(FILE *fp, struct rtattr *encap) print_srh(fp, tuninfo->srh); } +static void print_rpl_srh(FILE *fp, struct ipv6_rpl_sr_hdr *srh) +{ + int i; + + if (is_json_context()) + open_json_array(PRINT_JSON, "segs"); + else + fprintf(fp, "segs %d [ ", srh->segments_left); + + for (i = srh->segments_left - 1; i >= 0; i--) { + print_color_string(PRINT_ANY, COLOR_INET6, + NULL, "%s ", + rt_addr_n2a(AF_INET6, 16, 
&srh->rpl_segaddr[i])); + } + + if (is_json_context()) + close_json_array(PRINT_JSON, NULL); + else + fprintf(fp, "] "); +} + +static void print_encap_rpl(FILE *fp, struct rtattr *encap) +{ + struct rtattr *tb[RPL_IPTUNNEL_MAX + 1]; + struct ipv6_rpl_sr_hdr *srh; + + parse_rtattr_nested(tb, RPL_IPTUNNEL_MAX, encap); + + if (!tb[RPL_IPTUNNEL_SRH]) + return; + + srh = RTA_DATA(tb[RPL_IPTUNNEL_SRH]); + + print_rpl_srh(fp, srh); +} + static const char *seg6_action_names[SEG6_LOCAL_ACTION_MAX + 1] = { [SEG6_LOCAL_ACTION_END] = "End", [SEG6_LOCAL_ACTION_END_X] = "End.X", @@ -567,6 +609,9 @@ void lwt_print_encap(FILE *fp, struct rtattr *encap_type, case LWTUNNEL_ENCAP_SEG6_LOCAL: print_encap_seg6local(fp, encap); break; + case LWTUNNEL_ENCAP_RPL: + print_encap_rpl(fp, encap); + break; } } @@ -690,6 +735,79 @@ out: return ret; } +static struct ipv6_rpl_sr_hdr *parse_rpl_srh(char *segbuf) +{ + struct ipv6_rpl_sr_hdr *srh; + int nsegs = 0; + int srhlen; + char *s; + int i; + + s = segbuf; + for (i = 0; *s; *s++ == ',' ? 
i++ : *s); + nsegs = i + 1; + + srhlen = 8 + 16 * nsegs; + + srh = calloc(1, srhlen); + + srh->hdrlen = (srhlen >> 3) - 1; + srh->type = 3; + srh->segments_left = nsegs; + + for (s = strtok(segbuf, ","); s; s = strtok(NULL, ",")) { + inet_prefix addr; + + get_addr(&addr, s, AF_INET6); + memcpy(&srh->rpl_segaddr[i], addr.data, sizeof(struct in6_addr)); + i--; + } + + return srh; +} + +static int parse_encap_rpl(struct rtattr *rta, size_t len, int *argcp, + char ***argvp) +{ + struct ipv6_rpl_sr_hdr *srh; + char **argv = *argvp; + char segbuf[1024] = ""; + int argc = *argcp; + int segs_ok = 0; + int ret = 0; + int srhlen; + + while (argc > 0) { + if (strcmp(*argv, "segs") == 0) { + NEXT_ARG(); + if (segs_ok++) + duparg2("segs", *argv); + + strlcpy(segbuf, *argv, 1024); + } else { + break; + } + argc--; argv++; + } + + srh = parse_rpl_srh(segbuf); + srhlen = (srh->hdrlen + 1) << 3; + + if (rta_addattr_l(rta, len, RPL_IPTUNNEL_SRH, srh, + srhlen)) { + ret = -1; + goto out; + } + + *argcp = argc + 1; + *argvp = argv - 1; + +out: + free(srh); + + return ret; +} + struct lwt_x { struct rtattr *rta; size_t len; @@ -1537,6 +1655,9 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp, case LWTUNNEL_ENCAP_SEG6_LOCAL: ret = parse_encap_seg6local(rta, len, &argc, &argv); break; + case LWTUNNEL_ENCAP_RPL: + ret = parse_encap_rpl(rta, len, &argc, &argv); + break; default: fprintf(stderr, "Error: unsupported encap type\n"); break; From 9a25abde3aa67844d3c77a7e6b68ccf8d99def43 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Tue, 26 May 2020 16:40:55 +0700 Subject: [PATCH 30/33] tipc: enable printing of broadcast rcv link stats This commit allows printing the statistics of a broadcast-receiver link using the same tipc command, but with additional 'link' options: $ tipc link stat show --help Usage: tipc link stat show [ link { LINK | SUBSTRING | all } ] With: + 'LINK' : print the stats of the specific link 'LINK'; + 'SUBSTRING' : print the stats of all the 
links having the 'SUBSTRING' in name; + 'all' : print all the links' stats incl. the broadcast-receiver ones; Also, a link's stats can be reset in the usual way by specifying the link name in the command. For example: $ tipc l st sh l br Link Window:50 packets RX packets:0 fragments:0/0 bundles:0/0 TX packets:5011125 fragments:4968774/149643 bundles:38402/307061 RX naks:781484 defs:0 dups:0 TX naks:0 acks:0 retrans:330259 Congestion link:50657 Send queue max:0 avg:0 Link Window:50 packets RX packets:95146 fragments:95040/1980 bundles:1/2 TX packets:0 fragments:0/0 bundles:0/0 RX naks:380938 defs:83962 dups:403 TX naks:8362 acks:0 retrans:170662 Congestion link:0 Send queue max:0 avg:0 Link Window:50 packets RX packets:0 fragments:0/0 bundles:0/0 TX packets:0 fragments:0/0 bundles:0/0 RX naks:400546 defs:0 dups:0 TX naks:0 acks:0 retrans:159597 Congestion link:0 Send queue max:0 avg:0 $ tipc l st sh l 1001002 Link <1001003:data0-1001002:data0> ACTIVE MTU:1500 Priority:10 Tolerance:1500 ms Window:50 packets RX packets:99546 fragments:0/0 bundles:33/877 TX packets:629 fragments:0/0 bundles:35/828 TX profile sample:8 packets average:390 octets 0-64:75% -256:0% -1024:0% -4096:25% -16384:0% -32768:0% -66000:0% RX states:488714 probes:7397 naks:0 defs:4 dups:5 TX states:27734 probes:18016 naks:5 acks:2305 retrans:0 Congestion link:0 Send queue max:0 avg:0 Link Window:50 packets RX packets:0 fragments:0/0 bundles:0/0 TX packets:0 fragments:0/0 bundles:0/0 RX naks:400546 defs:0 dups:0 TX naks:0 acks:0 retrans:159597 Congestion link:0 Send queue max:0 avg:0 $ tipc l st re l broadcast-link:1001002 $ tipc l st sh l broadcast-link:1001002 Link Window:50 packets RX packets:0 fragments:0/0 bundles:0/0 TX packets:0 fragments:0/0 bundles:0/0 RX naks:0 defs:0 dups:0 TX naks:0 acks:0 retrans:0 Congestion link:0 Send queue max:0 avg:0 Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David Ahern --- tipc/link.c | 25 +++++++++++++++++-------- 1 file changed, 17
insertions(+), 8 deletions(-) diff --git a/tipc/link.c b/tipc/link.c index e123c186..ba77a201 100644 --- a/tipc/link.c +++ b/tipc/link.c @@ -334,7 +334,7 @@ static int _show_link_stat(const char *name, struct nlattr *attrs[], open_json_object(NULL); - print_string(PRINT_ANY, "link", "\nLink <%s>\n", name); + print_string(PRINT_ANY, "link", "Link <%s>\n", name); print_string(PRINT_JSON, "state", "", NULL); open_json_array(PRINT_JSON, NULL); if (attrs[TIPC_NLA_LINK_ACTIVE]) @@ -433,7 +433,7 @@ static int _show_link_stat(const char *name, struct nlattr *attrs[], mnl_attr_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS])); print_uint(PRINT_ANY, "send queue max", " Send queue max:%u", mnl_attr_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE])); - print_uint(PRINT_ANY, "avg", " avg:%u\n", + print_uint(PRINT_ANY, "avg", " avg:%u\n\n", mnl_attr_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE])); close_json_object(); @@ -496,7 +496,7 @@ static int _show_bc_link_stat(const char *name, struct nlattr *prop[], mnl_attr_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS])); print_uint(PRINT_ANY, "send queue max", " Send queue max:%u", mnl_attr_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE])); - print_uint(PRINT_ANY, "avg", " avg:%u\n", + print_uint(PRINT_ANY, "avg", " avg:%u\n\n", mnl_attr_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE])); close_json_object(); @@ -527,8 +527,10 @@ static int link_stat_show_cb(const struct nlmsghdr *nlh, void *data) name = mnl_attr_get_str(attrs[TIPC_NLA_LINK_NAME]); - /* If a link is passed, skip all but that link */ - if (link && (strcmp(name, link) != 0)) + /* If a link is passed, skip all but that link. + * Support a substring matching as well. 
+ */ + if (link && !strstr(name, link)) return MNL_CB_OK; if (attrs[TIPC_NLA_LINK_BROADCAST]) { @@ -540,7 +542,7 @@ static int link_stat_show_cb(const struct nlmsghdr *nlh, void *data) static void cmd_link_stat_show_help(struct cmdl *cmdl) { - fprintf(stderr, "Usage: %s link stat show [ link LINK ]\n", + fprintf(stderr, "Usage: %s link stat show [ link { LINK | SUBSTRING | all } ]\n", cmdl->argv[0]); } @@ -554,6 +556,7 @@ static int cmd_link_stat_show(struct nlmsghdr *nlh, const struct cmd *cmd, { "link", OPT_KEYVAL, NULL }, { NULL } }; + struct nlattr *attrs; int err = 0; if (help_flag) { @@ -571,8 +574,14 @@ static int cmd_link_stat_show(struct nlmsghdr *nlh, const struct cmd *cmd, return -EINVAL; opt = get_opt(opts, "link"); - if (opt) - link = opt->val; + if (opt) { + if (strcmp(opt->val, "all")) + link = opt->val; + /* Set the flag to dump all bc links */ + attrs = mnl_attr_nest_start(nlh, TIPC_NLA_LINK); + mnl_attr_put(nlh, TIPC_NLA_LINK_BROADCAST, 0, NULL); + mnl_attr_nest_end(nlh, attrs); + } new_json_obj(json); err = msg_dumpit(nlh, link_stat_show_cb, link); From bd4b8c632e92526e60bc18bd8453599ea78fa42f Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 May 2020 21:22:47 -0400 Subject: [PATCH 31/33] tc: report time an action was first used Have print_tm() dump firstuse value along with install, lastuse and expires. 
v2: Resubmit after 'master' merged into next Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/tc_util.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tc/tc_util.c b/tc/tc_util.c index fd5fcb24..b7ff911b 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -758,6 +758,10 @@ void print_tm(FILE *f, const struct tcf_t *tm) print_uint(PRINT_ANY, "last_used", " used %u sec", tm->lastuse / hz); + if (tm->firstuse != 0) + print_uint(PRINT_ANY, "first_used", " firstused %u sec", + tm->firstuse / hz); + if (tm->expires != 0) print_uint(PRINT_ANY, "expires", " expires %u sec", tm->expires / hz); From 9d59c86e575b5373d73f021f569ae520bc229ec5 Mon Sep 17 00:00:00 2001 From: "Ian K. Coolidge" Date: Wed, 27 May 2020 11:03:45 -0700 Subject: [PATCH 32/33] iproute2: ip addr: Organize flag properties structurally This creates a nice systematic way to check that the various flags are mutable from userspace and that the address family is valid. Mutability properties are preserved to avoid introducing any behavioral change in this CL. However, previously, immutable flags were ignored and fell through to this confusing error: Error: either "local" is duplicate, or "dadfailed" is a garbage. 
But now, they just warn more explicitly: Warning: dadfailed option is not mutable from userspace Signed-off-by: David Ahern --- ip/ipaddress.c | 112 ++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 80d27ce2..403f7010 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1233,52 +1233,63 @@ static unsigned int get_ifa_flags(struct ifaddrmsg *ifa, ifa->ifa_flags; } -/* Mapping from argument to address flag mask */ -static const struct { +/* Mapping from argument to address flag mask and attributes */ +static const struct ifa_flag_data_t { const char *name; - unsigned long value; -} ifa_flag_names[] = { - { "secondary", IFA_F_SECONDARY }, - { "temporary", IFA_F_SECONDARY }, - { "nodad", IFA_F_NODAD }, - { "optimistic", IFA_F_OPTIMISTIC }, - { "dadfailed", IFA_F_DADFAILED }, - { "home", IFA_F_HOMEADDRESS }, - { "deprecated", IFA_F_DEPRECATED }, - { "tentative", IFA_F_TENTATIVE }, - { "permanent", IFA_F_PERMANENT }, - { "mngtmpaddr", IFA_F_MANAGETEMPADDR }, - { "noprefixroute", IFA_F_NOPREFIXROUTE }, - { "autojoin", IFA_F_MCAUTOJOIN }, - { "stable-privacy", IFA_F_STABLE_PRIVACY }, + unsigned long mask; + bool readonly; + bool v6only; +} ifa_flag_data[] = { + { .name = "secondary", .mask = IFA_F_SECONDARY, .readonly = true, .v6only = false}, + { .name = "temporary", .mask = IFA_F_SECONDARY, .readonly = true, .v6only = false}, + { .name = "nodad", .mask = IFA_F_NODAD, .readonly = false, .v6only = true}, + { .name = "optimistic", .mask = IFA_F_OPTIMISTIC, .readonly = true, .v6only = true}, + { .name = "dadfailed", .mask = IFA_F_DADFAILED, .readonly = true, .v6only = true}, + { .name = "home", .mask = IFA_F_HOMEADDRESS, .readonly = false, .v6only = true}, + { .name = "deprecated", .mask = IFA_F_DEPRECATED, .readonly = true, .v6only = true}, + { .name = "tentative", .mask = IFA_F_TENTATIVE, .readonly = true, .v6only = true}, + { .name = "permanent", .mask = IFA_F_PERMANENT, 
.readonly = true, .v6only = true}, + { .name = "mngtmpaddr", .mask = IFA_F_MANAGETEMPADDR, .readonly = false, .v6only = true}, + { .name = "noprefixroute", .mask = IFA_F_NOPREFIXROUTE, .readonly = false, .v6only = true}, + { .name = "autojoin", .mask = IFA_F_MCAUTOJOIN, .readonly = false, .v6only = true}, + { .name = "stable-privacy", .mask = IFA_F_STABLE_PRIVACY, .readonly = true, .v6only = true}, }; +/* Returns a pointer to the data structure for a particular interface flag, or null if no flag could be found */ +static const struct ifa_flag_data_t* lookup_flag_data_by_name(const char* flag_name) { + for (int i = 0; i < ARRAY_SIZE(ifa_flag_data); ++i) { + if (strcmp(flag_name, ifa_flag_data[i].name) == 0) + return &ifa_flag_data[i]; + } + return NULL; +} + static void print_ifa_flags(FILE *fp, const struct ifaddrmsg *ifa, unsigned int flags) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(ifa_flag_names); i++) { - unsigned long mask = ifa_flag_names[i].value; + for (i = 0; i < ARRAY_SIZE(ifa_flag_data); i++) { + const struct ifa_flag_data_t* flag_data = &ifa_flag_data[i]; - if (mask == IFA_F_PERMANENT) { - if (!(flags & mask)) + if (flag_data->mask == IFA_F_PERMANENT) { + if (!(flags & flag_data->mask)) print_bool(PRINT_ANY, "dynamic", "dynamic ", true); - } else if (flags & mask) { - if (mask == IFA_F_SECONDARY && + } else if (flags & flag_data->mask) { + if (flag_data->mask == IFA_F_SECONDARY && ifa->ifa_family == AF_INET6) { print_bool(PRINT_ANY, "temporary", "temporary ", true); } else { print_string(PRINT_FP, NULL, - "%s ", ifa_flag_names[i].name); + "%s ", flag_data->name); print_bool(PRINT_JSON, - ifa_flag_names[i].name, NULL, true); + flag_data->name, NULL, true); } } - flags &= ~mask; + flags &= ~flag_data->mask; } if (flags) { @@ -1297,7 +1308,6 @@ static void print_ifa_flags(FILE *fp, const struct ifaddrmsg *ifa, static int get_filter(const char *arg) { bool inv = false; - unsigned int i; if (arg[0] == '-') { inv = true; @@ -1313,18 +1323,16 @@ static 
int get_filter(const char *arg) arg = "secondary"; } - for (i = 0; i < ARRAY_SIZE(ifa_flag_names); i++) { - if (strcmp(arg, ifa_flag_names[i].name)) - continue; + const struct ifa_flag_data_t* flag_data = lookup_flag_data_by_name(arg); + if (flag_data == NULL) + return -1; - if (inv) - filter.flags &= ~ifa_flag_names[i].value; - else - filter.flags |= ifa_flag_names[i].value; - filter.flagmask |= ifa_flag_names[i].value; - return 0; - } - return -1; + if (inv) + filter.flags &= ~flag_data->mask; + else + filter.flags |= flag_data->mask; + filter.flagmask |= flag_data->mask; + return 0; } static int ifa_label_match_rta(int ifindex, const struct rtattr *rta) @@ -2330,25 +2338,15 @@ static int ipaddr_modify(int cmd, int flags, int argc, char **argv) preferred_lftp = *argv; if (set_lifetime(&preferred_lft, *argv)) invarg("preferred_lft value", *argv); - } else if (strcmp(*argv, "home") == 0) { - if (req.ifa.ifa_family == AF_INET6) - ifa_flags |= IFA_F_HOMEADDRESS; - else - fprintf(stderr, "Warning: home option can be set only for IPv6 addresses\n"); - } else if (strcmp(*argv, "nodad") == 0) { - if (req.ifa.ifa_family == AF_INET6) - ifa_flags |= IFA_F_NODAD; - else - fprintf(stderr, "Warning: nodad option can be set only for IPv6 addresses\n"); - } else if (strcmp(*argv, "mngtmpaddr") == 0) { - if (req.ifa.ifa_family == AF_INET6) - ifa_flags |= IFA_F_MANAGETEMPADDR; - else - fprintf(stderr, "Warning: mngtmpaddr option can be set only for IPv6 addresses\n"); - } else if (strcmp(*argv, "noprefixroute") == 0) { - ifa_flags |= IFA_F_NOPREFIXROUTE; - } else if (strcmp(*argv, "autojoin") == 0) { - ifa_flags |= IFA_F_MCAUTOJOIN; + } else if (lookup_flag_data_by_name(*argv)) { + const struct ifa_flag_data_t* flag_data = lookup_flag_data_by_name(*argv); + if (flag_data->readonly) { + fprintf(stderr, "Warning: %s option is not mutable from userspace\n", flag_data->name); + } else if (flag_data->v6only && req.ifa.ifa_family != AF_INET6) { + fprintf(stderr, "Warning: %s option can 
be set only for IPv6 addresses\n", flag_data->name); + } else { + ifa_flags |= flag_data->mask; + } } else { if (strcmp(*argv, "local") == 0) NEXT_ARG(); From 5413a735a68de792eb7db1644fb3e3b888356863 Mon Sep 17 00:00:00 2001 From: "Ian K. Coolidge" Date: Wed, 27 May 2020 11:03:46 -0700 Subject: [PATCH 33/33] iproute2: ip addr: Add support for setting 'optimistic' optimistic DAD is controllable via sysctl for an interface or all interfaces on the system. This would affect addresses added by the kernel only. Recent kernels, however, have enabled support for adding optimistic address via userspace. This plumbs that support. Signed-off-by: David Ahern --- ip/ipaddress.c | 2 +- man/man8/ip-address.8.in | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 403f7010..3b53933f 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1243,7 +1243,7 @@ static const struct ifa_flag_data_t { { .name = "secondary", .mask = IFA_F_SECONDARY, .readonly = true, .v6only = false}, { .name = "temporary", .mask = IFA_F_SECONDARY, .readonly = true, .v6only = false}, { .name = "nodad", .mask = IFA_F_NODAD, .readonly = false, .v6only = true}, - { .name = "optimistic", .mask = IFA_F_OPTIMISTIC, .readonly = true, .v6only = true}, + { .name = "optimistic", .mask = IFA_F_OPTIMISTIC, .readonly = false, .v6only = true}, { .name = "dadfailed", .mask = IFA_F_DADFAILED, .readonly = true, .v6only = true}, { .name = "home", .mask = IFA_F_HOMEADDRESS, .readonly = false, .v6only = true}, { .name = "deprecated", .mask = IFA_F_DEPRECATED, .readonly = true, .v6only = true}, diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in index 2a553190..fe773c91 100644 --- a/man/man8/ip-address.8.in +++ b/man/man8/ip-address.8.in @@ -92,7 +92,7 @@ ip-address \- protocol address management .ti -8 .IR CONFFLAG " := " -.RB "[ " home " | " mngtmpaddr " | " nodad " | " noprefixroute " | " autojoin " ]" +.RB "[ " home " | " mngtmpaddr " | " nodad " | " 
optimistic " | " noprefixroute " | " autojoin " ]" .ti -8 .IR LIFETIME " := [ " @@ -258,6 +258,11 @@ stateless auto-configuration was active. (IPv6 only) do not perform Duplicate Address Detection (RFC 4862) when adding this address. +.TP +.B optimistic +(IPv6 only) When performing Duplicate Address Detection, use the RFC 4429 +optimistic variant. + .TP .B noprefixroute Do not automatically create a route for the network prefix of the added