From b2e8bf158460568ec5b48cba69f657f95891c901 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Oct 2018 15:03:30 -0700 Subject: [PATCH 01/50] ip rule: Add ipproto and port range to filter list Allow ip rule dumps and flushes to filter based on ipproto, sport and dport. Example: $ ip ru ls ipproto udp 99: from all to 8.8.8.8 ipproto udp dport 53 lookup 1001 $ ip ru ls dport 53 99: from all to 8.8.8.8 ipproto udp dport 53 lookup 1001 Signed-off-by: David Ahern --- ip/iprule.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/ip/iprule.c b/ip/iprule.c index a85a4390..9aa411ce 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -78,6 +78,9 @@ static struct inet_prefix dst; int protocol; int protocolmask; + struct fib_rule_port_range sport; + struct fib_rule_port_range dport; + __u8 ipproto; } filter; static inline int frh_get_table(struct fib_rule_hdr *frh, struct rtattr **tb) @@ -174,6 +177,39 @@ static bool filter_nlmsg(struct nlmsghdr *n, struct rtattr **tb, int host_len) return false; } + if (filter.ipproto) { + __u8 ipproto = 0; + + if (tb[FRA_IP_PROTO]) + ipproto = rta_getattr_u8(tb[FRA_IP_PROTO]); + if (filter.ipproto != ipproto) + return false; + } + + if (filter.sport.start) { + const struct fib_rule_port_range *r; + + if (!tb[FRA_SPORT_RANGE]) + return false; + + r = RTA_DATA(tb[FRA_SPORT_RANGE]); + if (r->start != filter.sport.start || + r->end != filter.sport.end) + return false; + } + + if (filter.dport.start) { + const struct fib_rule_port_range *r; + + if (!tb[FRA_DPORT_RANGE]) + return false; + + r = RTA_DATA(tb[FRA_DPORT_RANGE]); + if (r->start != filter.dport.start || + r->end != filter.dport.end) + return false; + } + table = frh_get_table(frh, tb); if (filter.tb > 0 && filter.tb ^ table) return false; @@ -607,6 +643,36 @@ static int iprule_list_flush_or_save(int argc, char **argv, int action) filter.protocolmask = 0; } filter.protocol = prot; + } else if (strcmp(*argv, "ipproto") == 0) { + int 
ipproto; + + NEXT_ARG(); + ipproto = inet_proto_a2n(*argv); + if (ipproto < 0) + invarg("Invalid \"ipproto\" value\n", *argv); + filter.ipproto = ipproto; + } else if (strcmp(*argv, "sport") == 0) { + struct fib_rule_port_range r; + int ret; + + NEXT_ARG(); + ret = sscanf(*argv, "%hu-%hu", &r.start, &r.end); + if (ret == 1) + r.end = r.start; + else if (ret != 2) + invarg("invalid port range\n", *argv); + filter.sport = r; + } else if (strcmp(*argv, "dport") == 0) { + struct fib_rule_port_range r; + int ret; + + NEXT_ARG(); + ret = sscanf(*argv, "%hu-%hu", &r.start, &r.end); + if (ret == 1) + r.end = r.start; + else if (ret != 2) + invarg("invalid dport range\n", *argv); + filter.dport = r; } else{ if (matches(*argv, "dst") == 0 || matches(*argv, "to") == 0) { From 3fb00075d904389afce507fffe06ca3a8500ebf3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 31 Oct 2018 09:17:55 +0200 Subject: [PATCH 02/50] rdma: Update kernel include file to support IB device renaming Bring kernel header file changes upto commit 05d940d3a3ec ("RDMA/nldev: Allow IB device rename through RDMA netlink") Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: David Ahern --- rdma/include/uapi/rdma/rdma_netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h index 6513fb89..e2228c09 100644 --- a/rdma/include/uapi/rdma/rdma_netlink.h +++ b/rdma/include/uapi/rdma/rdma_netlink.h @@ -227,8 +227,9 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_UNSPEC, RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, - /* 2 - 4 are free to use */ + /* 3 - 4 are free to use */ RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ From a14ceed32524c7f9c05572886cd63e921e4c0faf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 31 Oct 2018 09:17:56 +0200 Subject: [PATCH 03/50] rdma: Introduce command execution helper with required device name In contradiction to various show commands, the set 
command explicitly requires a device name as an argument. Provide a new command execution helper which enforces it. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: David Ahern --- rdma/rdma.h | 1 + rdma/utils.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/rdma/rdma.h b/rdma/rdma.h index c3b7530b..547bb574 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -90,6 +90,7 @@ int cmd_link(struct rd *rd); int cmd_res(struct rd *rd); int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str); int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd)); +int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd)); int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port); void rd_free(struct rd *rd); int rd_set_arg_to_devname(struct rd *rd); diff --git a/rdma/utils.c b/rdma/utils.c index 4840bf22..61f4aeb1 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -577,6 +577,16 @@ out: return ret; } +int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd)) +{ + if (rd_no_arg(rd)) { + pr_err("Please provide device name.\n"); + return -EINVAL; + } + + return rd_exec_dev(rd, cb); +} + int rd_exec_cmd(struct rd *rd, const struct rd_cmd *cmds, const char *str) { const struct rd_cmd *c; From 4443c9c6a01eac8c8f2743d4d185ceb9be4d1207 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 31 Oct 2018 09:17:57 +0200 Subject: [PATCH 04/50] rdma: Add an option to rename IB device interface Enrich rdmatool with an option to rename IB devices, the command interface follows Iproute2 convention: "rdma dev set [OLD-DEVNAME] name NEW-DEVNAME" Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: David Ahern --- rdma/dev.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/rdma/dev.c b/rdma/dev.c index e2eafe47..760b7fb3 100644 --- a/rdma/dev.c +++ b/rdma/dev.c @@ -14,6 +14,7 @@ static int dev_help(struct rd *rd) { pr_out("Usage: %s dev show [DEV]\n", rd->filename); + 
pr_out(" %s dev set [DEV] name DEVNAME\n", rd->filename); return 0; } @@ -240,17 +241,51 @@ static int dev_one_show(struct rd *rd) return rd_exec_cmd(rd, cmds, "parameter"); } +static int dev_set_name(struct rd *rd) +{ + uint32_t seq; + + if (rd_no_arg(rd)) { + pr_err("Please provide device new name.\n"); + return -EINVAL; + } + + rd_prepare_msg(rd, RDMA_NLDEV_CMD_SET, + &seq, (NLM_F_REQUEST | NLM_F_ACK)); + mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx); + mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd)); + + return rd_send_msg(rd); +} + +static int dev_one_set(struct rd *rd) +{ + const struct rd_cmd cmds[] = { + { NULL, dev_help}, + { "name", dev_set_name}, + { 0 } + }; + + return rd_exec_cmd(rd, cmds, "parameter"); +} + static int dev_show(struct rd *rd) { return rd_exec_dev(rd, dev_one_show); } +static int dev_set(struct rd *rd) +{ + return rd_exec_require_dev(rd, dev_one_set); +} + int cmd_dev(struct rd *rd) { const struct rd_cmd cmds[] = { { NULL, dev_show }, { "show", dev_show }, { "list", dev_show }, + { "set", dev_set }, { "help", dev_help }, { 0 } }; From e89feffae3626da5b5fda352ae73db132ac60a47 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Nov 2018 21:11:22 +0200 Subject: [PATCH 05/50] rdma: Document IB device renaming option [leonro@server /]$ lspci |grep -i Ether 00:08.0 Ethernet controller: Red Hat, Inc. 
Virtio network device 00:09.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4] [leonro@server /]$ sudo rdma dev 1: mlx5_0: node_type ca fw 3.8.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 [leonro@server /]$ sudo rdma dev set mlx5_0 name hfi1_0 [leonro@server /]$ sudo rdma dev 1: hfi1_0: node_type ca fw 3.8.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 Signed-off-by: Leon Romanovsky Signed-off-by: David Ahern --- man/man8/rdma-dev.8 | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8 index 461681b6..7c275180 100644 --- a/man/man8/rdma-dev.8 +++ b/man/man8/rdma-dev.8 @@ -1,6 +1,6 @@ .TH RDMA\-DEV 8 "06 Jul 2017" "iproute2" "Linux" .SH NAME -rdmak-dev \- RDMA device configuration +rdma-dev \- RDMA device configuration .SH SYNOPSIS .sp .ad l @@ -22,10 +22,18 @@ rdmak-dev \- RDMA device configuration .B rdma dev show .RI "[ " DEV " ]" +.ti -8 +.B rdma dev set +.RI "[ " DEV " ]" +.BR name +.BR NEWNAME + .ti -8 .B rdma dev help .SH "DESCRIPTION" +.SS rdma dev set - rename rdma device + .SS rdma dev show - display rdma device attributes .PP @@ -45,6 +53,11 @@ rdma dev show mlx5_3 Shows the state of specified RDMA device. .RE .PP +rdma dev set mlx5_3 name rdma_0 +.RS 4 +Renames the mlx5_3 device to rdma_0. +.RE +.PP .SH SEE ALSO .BR rdma (8), From 3d98eba4fe369460d13a53eab1b1792d7ffa6cf5 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 8 Nov 2018 12:21:25 +0100 Subject: [PATCH 06/50] iplink_vxlan: Add DF configuration Allow to set the DF bit behaviour for outgoing IPv4 packets: it can be always on, inherited from the inner header, or, by default, always off, which is the current behaviour. 
v2: - Indicate in the man page what DF refers to, using RFC 791 wording (David Ahern) Signed-off-by: Stefano Brivio Signed-off-by: David Ahern --- include/uapi/linux/if_link.h | 9 +++++++++ ip/iplink_vxlan.c | 29 +++++++++++++++++++++++++++++ man/man8/ip-link.8.in | 14 ++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 52e95197..52512202 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -531,6 +531,7 @@ enum { IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -540,6 +541,14 @@ struct ifla_vxlan_port_range { __be16 high; }; +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index 7fc0e2b4..86afbe13 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -31,6 +31,7 @@ static void print_explain(FILE *f) " [ local ADDR ]\n" " [ ttl TTL ]\n" " [ tos TOS ]\n" + " [ df DF ]\n" " [ flowlabel LABEL ]\n" " [ dev PHYS_DEV ]\n" " [ dstport PORT ]\n" @@ -52,6 +53,7 @@ static void print_explain(FILE *f) " ADDR := { IP_ADDRESS | any }\n" " TOS := { NUMBER | inherit }\n" " TTL := { 1..255 | auto | inherit }\n" + " DF := { unset | set | inherit }\n" " LABEL := 0-1048575\n" ); } @@ -170,6 +172,22 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, } else tos = 1; addattr8(n, 1024, IFLA_VXLAN_TOS, tos); + } else if (!matches(*argv, "df")) { + enum ifla_vxlan_df df; + + NEXT_ARG(); + check_duparg(&attrs, IFLA_VXLAN_DF, "df", *argv); + if (strcmp(*argv, "unset") == 0) + df = VXLAN_DF_UNSET; + else if (strcmp(*argv, "set") == 0) + df = VXLAN_DF_SET; + else if (strcmp(*argv, "inherit") == 0) + df = VXLAN_DF_INHERIT; + else + invarg("DF must be 'unset', 'set' or 'inherit'", + 
*argv); + + addattr8(n, 1024, IFLA_VXLAN_DF, df); } else if (!matches(*argv, "label") || !matches(*argv, "flowlabel")) { __u32 uval; @@ -538,6 +556,17 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) print_string(PRINT_FP, NULL, "ttl %s ", "auto"); } + if (tb[IFLA_VXLAN_DF]) { + enum ifla_vxlan_df df = rta_getattr_u8(tb[IFLA_VXLAN_DF]); + + if (df == VXLAN_DF_UNSET) + print_string(PRINT_JSON, "df", "df %s ", "unset"); + else if (df == VXLAN_DF_SET) + print_string(PRINT_ANY, "df", "df %s ", "set"); + else if (df == VXLAN_DF_INHERIT) + print_string(PRINT_ANY, "df", "df %s ", "inherit"); + } + if (tb[IFLA_VXLAN_LABEL]) { __u32 label = rta_getattr_u32(tb[IFLA_VXLAN_LABEL]); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 5132f514..a94cf4f1 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -496,6 +496,8 @@ the following additional arguments are supported: ] [ .BI tos " TOS " ] [ +.BI df " DF " +] [ .BI flowlabel " FLOWLABEL " ] [ .BI dstport " PORT " @@ -565,6 +567,18 @@ parameter. .BI tos " TOS" - specifies the TOS value to use in outgoing packets. +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + .sp .BI flowlabel " FLOWLABEL" - specifies the flow label to use in outgoing packets. From 64dbd03ea1266ebfb693d2e0805e366255f35f47 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 8 Nov 2018 12:21:26 +0100 Subject: [PATCH 07/50] iplink_geneve: Add DF configuration Allow to set the DF bit behaviour for outgoing IPv4 packets: it can be always on, inherited from the inner header, or, by default, always off, which is the current behaviour. 
v2: - Indicate in the man page what DF refers to, using RFC 791 wording (David Ahern) Signed-off-by: Stefano Brivio Signed-off-by: David Ahern --- include/uapi/linux/if_link.h | 9 +++++++++ ip/iplink_geneve.c | 29 +++++++++++++++++++++++++++++ man/man8/ip-link.8.in | 14 ++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 52512202..e1ef848a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -564,10 +564,19 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + /* PPP section */ enum { IFLA_PPP_UNSPEC, diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index c417842b..1872b74c 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -24,6 +24,7 @@ static void print_explain(FILE *f) " remote ADDR\n" " [ ttl TTL ]\n" " [ tos TOS ]\n" + " [ df DF ]\n" " [ flowlabel LABEL ]\n" " [ dstport PORT ]\n" " [ [no]external ]\n" @@ -35,6 +36,7 @@ static void print_explain(FILE *f) " ADDR := IP_ADDRESS\n" " TOS := { NUMBER | inherit }\n" " TTL := { 1..255 | auto | inherit }\n" + " DF := { unset | set | inherit }\n" " LABEL := 0-1048575\n" ); } @@ -115,6 +117,22 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, tos = uval; } else tos = 1; + } else if (!matches(*argv, "df")) { + enum ifla_geneve_df df; + + NEXT_ARG(); + check_duparg(&attrs, IFLA_GENEVE_DF, "df", *argv); + if (strcmp(*argv, "unset") == 0) + df = GENEVE_DF_UNSET; + else if (strcmp(*argv, "set") == 0) + df = GENEVE_DF_SET; + else if (strcmp(*argv, "inherit") == 0) + df = GENEVE_DF_INHERIT; + else + invarg("DF must be 'unset', 'set' or 'inherit'", + *argv); + + addattr8(n, 1024, IFLA_GENEVE_DF, df); } else if (!matches(*argv, 
"label") || !matches(*argv, "flowlabel")) { __u32 uval; @@ -287,6 +305,17 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) print_string(PRINT_FP, NULL, "tos %s ", "inherit"); } + if (tb[IFLA_GENEVE_DF]) { + enum ifla_geneve_df df = rta_getattr_u8(tb[IFLA_GENEVE_DF]); + + if (df == GENEVE_DF_UNSET) + print_string(PRINT_JSON, "df", "df %s ", "unset"); + else if (df == GENEVE_DF_SET) + print_string(PRINT_ANY, "df", "df %s ", "set"); + else if (df == GENEVE_DF_INHERIT) + print_string(PRINT_ANY, "df", "df %s ", "inherit"); + } + if (tb[IFLA_GENEVE_LABEL]) { __u32 label = rta_getattr_u32(tb[IFLA_GENEVE_LABEL]); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index a94cf4f1..73d37c19 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -1180,6 +1180,8 @@ the following additional arguments are supported: ] [ .BI tos " TOS " ] [ +.BI df " DF " +] [ .BI flowlabel " FLOWLABEL " ] [ .BI dstport " PORT" @@ -1212,6 +1214,18 @@ ttl. Default option is "0". .BI tos " TOS" - specifies the TOS value to use in outgoing packets. +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + .sp .BI flowlabel " FLOWLABEL" - specifies the flow label to use in outgoing packets. 
From 8d42678dfb0171d11aaf5414bd856e6863e5d7ba Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 20 Nov 2018 14:33:09 -0800 Subject: [PATCH 08/50] Update kernel headers Update kernel headers to b1a200484143 ("net-next/hinic: fix a bug in rx data flow") Signed-off-by: David Ahern --- include/uapi/linux/pkt_cls.h | 7 +++++++ include/uapi/linux/pkt_sched.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/sctp.h | 16 +++++++++++++++- include/uapi/linux/tcp.h | 1 + 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 401d0c1e..95d0db2a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -485,6 +485,11 @@ enum { TCA_FLOWER_IN_HW_COUNT, + TCA_FLOWER_KEY_PORT_SRC_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_SRC_MAX, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */ + __TCA_FLOWER_MAX, }; @@ -518,6 +523,8 @@ enum { TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), }; +#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ + /* Match-all classifier */ enum { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 89ee47c2..0d18b1d1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -291,11 +291,38 @@ enum { TCA_GRED_DPS, TCA_GRED_MAX_P, TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ __TCA_GRED_MAX, }; #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + 
TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + struct tc_gred_qopt { __u32 limit; /* HARD maximal queue length (bytes) */ __u32 qth_min; /* Min average length threshold (bytes) */ @@ -864,6 +891,8 @@ enum { TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + __TCA_FQ_MAX }; @@ -882,6 +911,7 @@ struct tc_fq_qd_stats { __u32 inactive_flows; __u32 throttled_flows; __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ }; /* Heavy-Hitter Filter */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 626480b6..e9970b69 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_STREAM_SCHEDULER_VALUE 124 #define SCTP_INTERLEAVING_SUPPORTED 125 #define SCTP_SENDMSG_CONNECT 126 +#define SCTP_EVENT 127 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -568,6 +569,8 @@ struct sctp_assoc_reset_event { #define SCTP_ASSOC_CHANGE_DENIED 0x0004 #define SCTP_ASSOC_CHANGE_FAILED 0x0008 +#define SCTP_STREAM_CHANGE_DENIED SCTP_ASSOC_CHANGE_DENIED +#define SCTP_STREAM_CHANGE_FAILED SCTP_ASSOC_CHANGE_FAILED struct sctp_stream_change_event { __u16 strchange_type; __u16 strchange_flags; @@ -630,7 +633,9 @@ union sctp_notification { */ enum sctp_sn_type { - SCTP_SN_TYPE_BASE = (1<<15), + SCTP_SN_TYPE_BASE = (1<<15), + SCTP_DATA_IO_EVENT = SCTP_SN_TYPE_BASE, +#define SCTP_DATA_IO_EVENT SCTP_DATA_IO_EVENT SCTP_ASSOC_CHANGE, #define SCTP_ASSOC_CHANGE SCTP_ASSOC_CHANGE SCTP_PEER_ADDR_CHANGE, @@ -655,6 +660,8 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT + SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, +#define 
SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; /* Notification error codes used to fill up the error fields in some @@ -1142,9 +1149,16 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +struct sctp_event { + sctp_assoc_t se_assoc_id; + uint16_t se_type; + uint8_t se_on; +}; + /* SCTP Stream schedulers */ enum sctp_sched_type { SCTP_SS_FCFS, + SCTP_SS_DEFAULT = SCTP_SS_FCFS, SCTP_SS_PRIO, SCTP_SS_RR, SCTP_SS_MAX = SCTP_SS_RR diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6ec77662..799b5c5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -266,6 +266,7 @@ enum { TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ TCP_NLA_REORD_SEEN, /* reordering events seen */ + TCP_NLA_SRTT, /* smoothed RTT in usecs */ }; /* for TCP_MD5SIG socket option */ From e20e50b0c1643d371dc6597dfc38355d235c84a0 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 15 Nov 2018 16:55:13 -0800 Subject: [PATCH 09/50] tc: flower: Classify packets based port ranges Added support for filtering based on port ranges. UAPI changes have been accepted into net-next. Example: 1. Match on a port range: ------------------------- $ tc filter add dev enp4s0 protocol ip parent ffff:\ prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\ action drop $ tc -s filter show dev enp4s0 parent ffff: filter protocol ip pref 1 flower chain 0 filter protocol ip pref 1 flower chain 0 handle 0x1 eth_type ipv4 ip_proto tcp dst_port range 20-30 skip_hw not_in_hw action order 1: gact action drop random type none pass val 0 index 1 ref 1 bind 1 installed 85 sec used 3 sec Action statistics: Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0) backlog 0b 0p requeues 0 2. 
Match on IP address and port range: -------------------------------------- $ tc filter add dev enp4s0 protocol ip parent ffff:\ prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\ skip_hw action drop $ tc -s filter show dev enp4s0 parent ffff: filter protocol ip pref 1 flower chain 0 handle 0x2 eth_type ipv4 ip_proto tcp dst_ip 192.168.1.1 dst_port range 100-200 skip_hw not_in_hw action order 1: gact action drop random type none pass val 0 index 2 ref 1 bind 1 installed 58 sec used 2 sec Action statistics: Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0) backlog 0b 0p requeues 0 v3: Modified flower_port_range_attr_type calls. v2: Addressed Jiri's comment to sync output format with input Signed-off-by: Amritha Nambiar Acked-by: Jiri Pirko Signed-off-by: David Ahern --- tc/f_flower.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 133 insertions(+), 10 deletions(-) diff --git a/tc/f_flower.c b/tc/f_flower.c index 65fca043..9bddf7be 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -494,6 +494,68 @@ static int flower_parse_port(char *str, __u8 ip_proto, return 0; } +static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type, + __be16 *min_port_type, + __be16 *max_port_type) +{ + if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP || + ip_proto == IPPROTO_SCTP) { + if (type == FLOWER_ENDPOINT_SRC) { + *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX; + } else { + *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX; + } + } else { + return -1; + } + + return 0; +} + +static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto, + enum flower_endpoint endpoint, + struct nlmsghdr *n) +{ + __be16 min_port_type, max_port_type; + + if (flower_port_range_attr_type(ip_proto, endpoint, &min_port_type, + &max_port_type)) + return -1; + + addattr16(n, MAX_MSG, min_port_type, *min); + addattr16(n, MAX_MSG, 
max_port_type, *max); + + return 0; +} + +static int get_range(__be16 *min, __be16 *max, char *argv) +{ + char *r; + + r = strchr(argv, '-'); + if (r) { + *r = '\0'; + if (get_be16(min, argv, 10)) { + fprintf(stderr, "invalid min range\n"); + return -1; + } + if (get_be16(max, r + 1, 10)) { + fprintf(stderr, "invalid max range\n"); + return -1; + } + if (htons(*max) <= htons(*min)) { + fprintf(stderr, "max value should be greater than min value\n"); + return -1; + } + } else { + fprintf(stderr, "Illegal range format\n"); + return -1; + } + return 0; +} + #define TCP_FLAGS_MAX_MASK 0xfff static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type, @@ -1061,20 +1123,54 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, return -1; } } else if (matches(*argv, "dst_port") == 0) { + __be16 min, max; + NEXT_ARG(); - ret = flower_parse_port(*argv, ip_proto, - FLOWER_ENDPOINT_DST, n); - if (ret < 0) { - fprintf(stderr, "Illegal \"dst_port\"\n"); - return -1; + if (matches(*argv, "range") == 0) { + NEXT_ARG(); + ret = get_range(&min, &max, *argv); + if (ret < 0) + return -1; + ret = flower_parse_port_range(&min, &max, + ip_proto, + FLOWER_ENDPOINT_DST, + n); + if (ret < 0) { + fprintf(stderr, "Illegal \"dst_port range\"\n"); + return -1; + } + } else { + ret = flower_parse_port(*argv, ip_proto, + FLOWER_ENDPOINT_DST, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"dst_port\"\n"); + return -1; + } } } else if (matches(*argv, "src_port") == 0) { + __be16 min, max; + NEXT_ARG(); - ret = flower_parse_port(*argv, ip_proto, - FLOWER_ENDPOINT_SRC, n); - if (ret < 0) { - fprintf(stderr, "Illegal \"src_port\"\n"); - return -1; + if (matches(*argv, "range") == 0) { + NEXT_ARG(); + ret = get_range(&min, &max, *argv); + if (ret < 0) + return -1; + ret = flower_parse_port_range(&min, &max, + ip_proto, + FLOWER_ENDPOINT_SRC, + n); + if (ret < 0) { + fprintf(stderr, "Illegal \"src_port range\"\n"); + return -1; + } + } else { + ret = 
flower_parse_port(*argv, ip_proto, + FLOWER_ENDPOINT_SRC, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"src_port\"\n"); + return -1; + } } } else if (matches(*argv, "tcp_flags") == 0) { NEXT_ARG(); @@ -1490,6 +1586,22 @@ static void flower_print_port(char *name, struct rtattr *attr) print_hu(PRINT_ANY, name, namefrm, rta_getattr_be16(attr)); } +static void flower_print_port_range(char *name, struct rtattr *min_attr, + struct rtattr *max_attr) +{ + SPRINT_BUF(namefrm); + SPRINT_BUF(out); + size_t done; + + if (!min_attr || !max_attr) + return; + + done = sprintf(out, "%u", rta_getattr_be16(min_attr)); + sprintf(out + done, "-%u", rta_getattr_be16(max_attr)); + sprintf(namefrm, "\n %s %%s", name); + print_string(PRINT_ANY, name, namefrm, out); +} + static void flower_print_tcp_flags(const char *name, struct rtattr *flags_attr, struct rtattr *mask_attr) { @@ -1678,6 +1790,7 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) { struct rtattr *tb[TCA_FLOWER_MAX + 1]; + __be16 min_port_type, max_port_type; int nl_type, nl_mask_type; __be16 eth_type = 0; __u8 ip_proto = 0xff; @@ -1796,6 +1909,16 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, if (nl_type >= 0) flower_print_port("src_port", tb[nl_type]); + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_DST, + &min_port_type, &max_port_type)) + flower_print_port_range("dst_port range", + tb[min_port_type], tb[max_port_type]); + + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_SRC, + &min_port_type, &max_port_type)) + flower_print_port_range("src_port range", + tb[min_port_type], tb[max_port_type]); + flower_print_tcp_flags("tcp_flags", tb[TCA_FLOWER_KEY_TCP_FLAGS], tb[TCA_FLOWER_KEY_TCP_FLAGS_MASK]); From 6ae54b13266b4eebd77509f1d8508af867aadeae Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 24 Nov 2018 07:06:17 -0800 Subject: [PATCH 10/50] Revert "rdma: make local functions static" This reverts commit 
e99c4443ae1d582950a2207067368a59152bbd77. Patch added to iproute2-master breaks builds of -next because of a more recent patch in -next that relies on the exports. Revert the offending patch. Unfortunately this leaves a window where builds break. Signed-off-by: David Ahern --- rdma/rdma.h | 11 +++++++++++ rdma/utils.c | 12 ++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/rdma/rdma.h b/rdma/rdma.h index 42be9174..547bb574 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -74,6 +74,13 @@ struct rd_cmd { int (*func)(struct rd *rd); }; +/* + * Parser interface + */ +bool rd_no_arg(struct rd *rd); +void rd_arg_inc(struct rd *rd); + +char *rd_argv(struct rd *rd); /* * Commands interface @@ -89,6 +96,8 @@ void rd_free(struct rd *rd); int rd_set_arg_to_devname(struct rd *rd); int rd_argc(struct rd *rd); +int strcmpx(const char *str1, const char *str2); + /* * Device manipulation */ @@ -109,12 +118,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq); void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t flags); int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data); int rd_attr_cb(const struct nlattr *attr, void *data); +int rd_attr_check(const struct nlattr *attr, int *typep); /* * Print helpers */ void print_driver_table(struct rd *rd, struct nlattr *tb); void newline(struct rd *rd); +void newline_indent(struct rd *rd); #define MAX_LINE_LENGTH 80 #endif /* _RDMA_TOOL_H_ */ diff --git a/rdma/utils.c b/rdma/utils.c index 696b69a4..61f4aeb1 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -18,14 +18,14 @@ int rd_argc(struct rd *rd) return rd->argc; } -static char *rd_argv(struct rd *rd) +char *rd_argv(struct rd *rd) { if (!rd_argc(rd)) return NULL; return *rd->argv; } -static int strcmpx(const char *str1, const char *str2) +int strcmpx(const char *str1, const char *str2) { if (strlen(str1) > strlen(str2)) return -1; @@ -39,7 +39,7 @@ static bool rd_argv_match(struct rd *rd, const char *pattern) return 
strcmpx(rd_argv(rd), pattern) == 0; } -static void rd_arg_inc(struct rd *rd) +void rd_arg_inc(struct rd *rd) { if (!rd_argc(rd)) return; @@ -47,7 +47,7 @@ static void rd_arg_inc(struct rd *rd) rd->argv++; } -static bool rd_no_arg(struct rd *rd) +bool rd_no_arg(struct rd *rd) { return rd_argc(rd) == 0; } @@ -404,7 +404,7 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U64] = MNL_TYPE_U64, }; -static int rd_attr_check(const struct nlattr *attr, int *typep) +int rd_attr_check(const struct nlattr *attr, int *typep) { int type; @@ -706,7 +706,7 @@ void newline(struct rd *rd) pr_out("\n"); } -static void newline_indent(struct rd *rd) +void newline_indent(struct rd *rd) { newline(rd); if (!rd->json_output) From c8f201e3d2329a29f8f309dba1d4948d4ead3c58 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:28 -0800 Subject: [PATCH 11/50] tc: gred: remove unclear comment The comment about providing a proper message seems similar to the comment in the kernel which says: /* hack -- fix at some point with proper message This is how we indicate to tc that there is no VQ at this DP */ it's unclear what that message would be, and whether it's needed. Remove the confusing comment. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_gred.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tc/q_gred.c b/tc/q_gred.c index e63fac72..80a9ccbb 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -302,8 +302,6 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) return -1; } -/* Bad hack! should really return a proper message as shown above*/ - fprintf(f, "vqs %u default %u %s", sopt->DPs, sopt->def_DP, From b640e85d2d98d96549364130a45dce7300a3e0c9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:29 -0800 Subject: [PATCH 12/50] json: add %hhu helpers Add helpers for printing char-size values. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- include/json_print.h | 1 + include/json_writer.h | 2 ++ lib/json_print.c | 1 + lib/json_writer.c | 11 +++++++++++ 4 files changed, 15 insertions(+) diff --git a/include/json_print.h b/include/json_print.h index 218da31a..25954070 100644 --- a/include/json_print.h +++ b/include/json_print.h @@ -64,6 +64,7 @@ _PRINT_FUNC(null, const char*); _PRINT_FUNC(string, const char*); _PRINT_FUNC(uint, unsigned int); _PRINT_FUNC(u64, uint64_t); +_PRINT_FUNC(hhu, unsigned char); _PRINT_FUNC(hu, unsigned short); _PRINT_FUNC(hex, unsigned int); _PRINT_FUNC(0xhex, unsigned long long int); diff --git a/include/json_writer.h b/include/json_writer.h index 0c8831c1..354c2754 100644 --- a/include/json_writer.h +++ b/include/json_writer.h @@ -38,6 +38,7 @@ void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num); void jsonw_uint(json_writer_t *self, unsigned int number); void jsonw_u64(json_writer_t *self, uint64_t number); void jsonw_xint(json_writer_t *self, uint64_t number); +void jsonw_hhu(json_writer_t *self, unsigned char num); void jsonw_hu(json_writer_t *self, unsigned short number); void jsonw_int(json_writer_t *self, int number); void jsonw_s64(json_writer_t *self, int64_t number); @@ -52,6 +53,7 @@ void jsonw_float_field(json_writer_t *self, const char *prop, double num); void jsonw_uint_field(json_writer_t *self, const char *prop, unsigned int num); void jsonw_u64_field(json_writer_t *self, const char *prop, uint64_t num); void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num); +void jsonw_hhu_field(json_writer_t *self, const char *prop, unsigned char num); void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num); void jsonw_int_field(json_writer_t *self, const char *prop, int num); void jsonw_s64_field(json_writer_t *self, const char *prop, int64_t num); diff --git a/lib/json_print.c b/lib/json_print.c index f7ef41c1..4f5fef19 
100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -118,6 +118,7 @@ void close_json_array(enum output_type type, const char *str) } _PRINT_FUNC(int, int); _PRINT_FUNC(s64, int64_t); +_PRINT_FUNC(hhu, unsigned char); _PRINT_FUNC(hu, unsigned short); _PRINT_FUNC(uint, unsigned int); _PRINT_FUNC(u64, uint64_t); diff --git a/lib/json_writer.c b/lib/json_writer.c index 68890b34..46eff6ad 100644 --- a/lib/json_writer.c +++ b/lib/json_writer.c @@ -211,6 +211,11 @@ void jsonw_float(json_writer_t *self, double num) jsonw_printf(self, "%g", num); } +void jsonw_hhu(json_writer_t *self, unsigned char num) +{ + jsonw_printf(self, "%hhu", num); +} + void jsonw_hu(json_writer_t *self, unsigned short num) { jsonw_printf(self, "%hu", num); @@ -288,6 +293,12 @@ void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num) jsonw_xint(self, num); } +void jsonw_hhu_field(json_writer_t *self, const char *prop, unsigned char num) +{ + jsonw_name(self, prop); + jsonw_hhu(self, num); +} + void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num) { jsonw_name(self, prop); From 33021752cd85f85086c8a9352225eecccd6bb85f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:30 -0800 Subject: [PATCH 13/50] tc: move RED flag printing to helper Number of qdiscs use the same set of flags to control shared RED implementation. Add a helper for printing those flags. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_choke.c | 3 +-- tc/q_red.c | 14 ++------------ tc/q_sfq.c | 3 +-- tc/tc_red.c | 20 ++++++++++++++++++++ tc/tc_red.h | 1 + 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/tc/q_choke.c b/tc/q_choke.c index b269b133..1353c80c 100644 --- a/tc/q_choke.c +++ b/tc/q_choke.c @@ -188,8 +188,7 @@ static int choke_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) fprintf(f, "limit %up min %up max %up ", qopt->limit, qopt->qth_min, qopt->qth_max); - if (qopt->flags & TC_RED_ECN) - fprintf(f, "ecn "); + tc_red_print_flags(qopt->flags); if (show_details) { fprintf(f, "ewma %u ", qopt->Wlog); diff --git a/tc/q_red.c b/tc/q_red.c index 49fd4ac8..3b3a1204 100644 --- a/tc/q_red.c +++ b/tc/q_red.c @@ -189,18 +189,8 @@ static int red_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_uint(PRINT_JSON, "max", NULL, qopt->qth_max); print_string(PRINT_FP, NULL, "max %s ", sprint_size(qopt->qth_max, b3)); - if (qopt->flags & TC_RED_ECN) - print_bool(PRINT_ANY, "ecn", "ecn ", true); - else - print_bool(PRINT_ANY, "ecn", NULL, false); - if (qopt->flags & TC_RED_HARDDROP) - print_bool(PRINT_ANY, "harddrop", "harddrop ", true); - else - print_bool(PRINT_ANY, "harddrop", NULL, false); - if (qopt->flags & TC_RED_ADAPTATIVE) - print_bool(PRINT_ANY, "adaptive", "adaptive ", true); - else - print_bool(PRINT_ANY, "adaptive", NULL, false); + tc_red_print_flags(qopt->flags); + if (show_details) { print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog); if (max_P) diff --git a/tc/q_sfq.c b/tc/q_sfq.c index 6a1d853b..eee31ec5 100644 --- a/tc/q_sfq.c +++ b/tc/q_sfq.c @@ -235,8 +235,7 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) sprint_size(qopt_ext->qth_min, b2), sprint_size(qopt_ext->qth_max, b3), qopt_ext->max_P / pow(2, 32)); - if (qopt_ext->flags & TC_RED_ECN) - fprintf(f, "ecn "); + tc_red_print_flags(qopt_ext->flags); if 
(show_stats) { fprintf(f, "\n prob_mark %u prob_mark_head %u prob_drop %u", qopt_ext->stats.prob_mark, diff --git a/tc/tc_red.c b/tc/tc_red.c index 178fe088..3ce3ca42 100644 --- a/tc/tc_red.c +++ b/tc/tc_red.c @@ -20,7 +20,9 @@ #include #include +#include "utils.h" #include "tc_core.h" +#include "tc_util.h" #include "tc_red.h" /* @@ -97,3 +99,21 @@ int tc_red_eval_idle_damping(int Wlog, unsigned int avpkt, unsigned int bps, __u sbuf[255] = 31; return clog; } + +void tc_red_print_flags(__u32 flags) +{ + if (flags & TC_RED_ECN) + print_bool(PRINT_ANY, "ecn", "ecn ", true); + else + print_bool(PRINT_ANY, "ecn", NULL, false); + + if (flags & TC_RED_HARDDROP) + print_bool(PRINT_ANY, "harddrop", "harddrop ", true); + else + print_bool(PRINT_ANY, "harddrop", NULL, false); + + if (flags & TC_RED_ADAPTATIVE) + print_bool(PRINT_ANY, "adaptive", "adaptive ", true); + else + print_bool(PRINT_ANY, "adaptive", NULL, false); +} diff --git a/tc/tc_red.h b/tc/tc_red.h index 6c6e6b03..3882c831 100644 --- a/tc/tc_red.h +++ b/tc/tc_red.h @@ -6,5 +6,6 @@ int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob); int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt); int tc_red_eval_idle_damping(int wlog, unsigned avpkt, unsigned bandwidth, __u8 *sbuf); +void tc_red_print_flags(__u32 flags); #endif From 6475e6a5800bf2e1f5da3abc4d7f07dd906cbc26 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:31 -0800 Subject: [PATCH 14/50] tc: gred: jsonify GRED output Make GRED dump JSON-compatible. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_gred.c | 105 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/tc/q_gred.c b/tc/q_gred.c index 80a9ccbb..768b77ba 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -275,8 +275,6 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) unsigned int i; SPRINT_BUF(b1); - SPRINT_BUF(b2); - SPRINT_BUF(b3); if (opt == NULL) return 0; @@ -302,45 +300,90 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) return -1; } - fprintf(f, "vqs %u default %u %s", - sopt->DPs, - sopt->def_DP, - sopt->grio ? "grio " : ""); + print_uint(PRINT_ANY, "dp_cnt", "vqs %u ", sopt->DPs); + print_uint(PRINT_ANY, "dp_default", "default %u ", sopt->def_DP); - if (limit) - fprintf(f, "limit %s ", - sprint_size(*limit, b1)); + if (sopt->grio) + print_bool(PRINT_ANY, "grio", "grio ", true); + else + print_bool(PRINT_ANY, "grio", NULL, false); + if (limit) { + print_uint(PRINT_JSON, "limit", NULL, *limit); + print_string(PRINT_FP, NULL, "limit %s ", + sprint_size(*limit, b1)); + } + + open_json_array(PRINT_JSON, "vqs"); for (i = 0; i < MAX_DPs; i++, qopt++) { - if (qopt->DP >= MAX_DPs) continue; - fprintf(f, "\n vq %u prio %hhu limit %s min %s max %s ", - qopt->DP, - qopt->prio, - sprint_size(qopt->limit, b1), - sprint_size(qopt->qth_min, b2), - sprint_size(qopt->qth_max, b3)); + if (qopt->DP >= MAX_DPs) + continue; + + open_json_object(NULL); + + print_uint(PRINT_ANY, "vq", "\n vq %u ", qopt->DP); + print_hhu(PRINT_ANY, "prio", "prio %hhu ", qopt->prio); + + print_uint(PRINT_JSON, "limit", NULL, qopt->limit); + print_string(PRINT_FP, NULL, "limit %s ", + sprint_size(qopt->limit, b1)); + + print_uint(PRINT_JSON, "min", NULL, qopt->qth_min); + print_string(PRINT_FP, NULL, "min %s ", + sprint_size(qopt->qth_min, b1)); + + print_uint(PRINT_JSON, "max", NULL, qopt->qth_max); + 
print_string(PRINT_FP, NULL, "max %s ", + sprint_size(qopt->qth_max, b1)); + if (show_details) { - fprintf(f, "ewma %u ", qopt->Wlog); + print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog); if (max_p) - fprintf(f, "probability %lg ", max_p[i] / pow(2, 32)); + print_float(PRINT_ANY, "probability", + "probability %lg ", + max_p[i] / pow(2, 32)); else - fprintf(f, "Plog %u ", qopt->Plog); - fprintf(f, "Scell_log %u ", qopt->Scell_log); + print_uint(PRINT_ANY, "Plog", "Plog %u ", + qopt->Plog); + print_uint(PRINT_ANY, "Scell_log", "Scell_log %u ", + qopt->Scell_log); } if (show_stats) { - fprintf(f, "\n Queue size: average %s current %s ", - sprint_size(qopt->qave, b1), - sprint_size(qopt->backlog, b2)); - fprintf(f, "\n Dropped packets: forced %u early %u pdrop %u other %u ", - qopt->forced, - qopt->early, - qopt->pdrop, - qopt->other); - fprintf(f, "\n Total packets: %u (%s) ", - qopt->packets, - sprint_size(qopt->bytesin, b1)); + if (!is_json_context()) + printf("\n Queue size: "); + + print_uint(PRINT_JSON, "qave", NULL, qopt->qave); + print_string(PRINT_FP, NULL, "average %s ", + sprint_size(qopt->qave, b1)); + + print_uint(PRINT_JSON, "backlog", NULL, qopt->backlog); + print_string(PRINT_FP, NULL, "current %s ", + sprint_size(qopt->backlog, b1)); + + if (!is_json_context()) + printf("\n Dropped packets: "); + + print_uint(PRINT_ANY, "forced_drop", "forced %u ", + qopt->forced); + print_uint(PRINT_ANY, "prob_drop", "early %u ", + qopt->early); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", + qopt->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", + qopt->other); + + if (!is_json_context()) + printf("\n Total packets: "); + + print_uint(PRINT_ANY, "packets", "%u ", qopt->packets); + + print_uint(PRINT_JSON, "bytes", NULL, qopt->bytesin); + print_string(PRINT_FP, NULL, "(%s) ", + sprint_size(qopt->bytesin, b1)); } + close_json_object(); } + close_json_array(PRINT_JSON, "vqs"); return 0; } From c3e1cd28c195c2dc9ba230b11c516e5ca804562e Mon Sep 17 00:00:00 2001 
From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:32 -0800 Subject: [PATCH 15/50] tc: gred: separate out stats printing Printing GRED statistics is long and deserves a function on its own. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_gred.c | 67 +++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/tc/q_gred.c b/tc/q_gred.c index 768b77ba..501437bc 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -265,6 +265,38 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n return 0; } +static void gred_print_stats(struct tc_gred_qopt *qopt) +{ + SPRINT_BUF(b1); + + if (!is_json_context()) + printf("\n Queue size: "); + + print_uint(PRINT_JSON, "qave", NULL, qopt->qave); + print_string(PRINT_FP, NULL, "average %s ", + sprint_size(qopt->qave, b1)); + + print_uint(PRINT_JSON, "backlog", NULL, qopt->backlog); + print_string(PRINT_FP, NULL, "current %s ", + sprint_size(qopt->backlog, b1)); + + if (!is_json_context()) + printf("\n Dropped packets: "); + + print_uint(PRINT_ANY, "forced_drop", "forced %u ", qopt->forced); + print_uint(PRINT_ANY, "prob_drop", "early %u ", qopt->early); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", qopt->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", qopt->other); + + if (!is_json_context()) + printf("\n Total packets: "); + + print_uint(PRINT_ANY, "packets", "%u ", qopt->packets); + + print_uint(PRINT_JSON, "bytes", NULL, qopt->bytesin); + print_string(PRINT_FP, NULL, "(%s) ", sprint_size(qopt->bytesin, b1)); +} + static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) { struct rtattr *tb[TCA_GRED_MAX + 1]; @@ -348,39 +380,8 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_uint(PRINT_ANY, "Scell_log", "Scell_log %u ", qopt->Scell_log); } - if (show_stats) { - if (!is_json_context()) - printf("\n Queue size: "); - - print_uint(PRINT_JSON, "qave", 
NULL, qopt->qave); - print_string(PRINT_FP, NULL, "average %s ", - sprint_size(qopt->qave, b1)); - - print_uint(PRINT_JSON, "backlog", NULL, qopt->backlog); - print_string(PRINT_FP, NULL, "current %s ", - sprint_size(qopt->backlog, b1)); - - if (!is_json_context()) - printf("\n Dropped packets: "); - - print_uint(PRINT_ANY, "forced_drop", "forced %u ", - qopt->forced); - print_uint(PRINT_ANY, "prob_drop", "early %u ", - qopt->early); - print_uint(PRINT_ANY, "pdrop", "pdrop %u ", - qopt->pdrop); - print_uint(PRINT_ANY, "other", "other %u ", - qopt->other); - - if (!is_json_context()) - printf("\n Total packets: "); - - print_uint(PRINT_ANY, "packets", "%u ", qopt->packets); - - print_uint(PRINT_JSON, "bytes", NULL, qopt->bytesin); - print_string(PRINT_FP, NULL, "(%s) ", - sprint_size(qopt->bytesin, b1)); - } + if (show_stats) + gred_print_stats(qopt); close_json_object(); } close_json_array(PRINT_JSON, "vqs"); From fdaff63c6a4eaa874bd79af458e8093ac4dd6672 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:33 -0800 Subject: [PATCH 16/50] tc: gred: use extended stats if available Use the extended attributes with extra and better stats, when possible. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_gred.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 114 insertions(+), 8 deletions(-) diff --git a/tc/q_gred.c b/tc/q_gred.c index 501437bc..fda41a57 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -265,8 +265,90 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n return 0; } -static void gred_print_stats(struct tc_gred_qopt *qopt) +struct tc_gred_info { + __u64 bytes; + __u32 packets; + __u32 backlog; + __u32 prob_drop; + __u32 prob_mark; + __u32 forced_drop; + __u32 forced_mark; + __u32 pdrop; + __u32 other; +}; + +static void +gred_parse_vqs(struct tc_gred_info *info, struct rtattr *vqs) { + int rem = RTA_PAYLOAD(vqs); + unsigned int offset = 0; + + while (rem > offset) { + struct rtattr *tb_entry[TCA_GRED_VQ_ENTRY_MAX + 1] = {}; + struct rtattr *tb[TCA_GRED_VQ_MAX + 1] = {}; + struct rtattr *entry; + unsigned int len; + unsigned int dp; + + entry = RTA_DATA(vqs) + offset; + + parse_rtattr(tb_entry, TCA_GRED_VQ_ENTRY_MAX, entry, + rem - offset); + len = RTA_LENGTH(RTA_PAYLOAD(entry)); + offset += len; + + if (!tb_entry[TCA_GRED_VQ_ENTRY]) { + fprintf(stderr, + "ERROR: Failed to parse Virtual Queue entry\n"); + continue; + } + + parse_rtattr_nested(tb, TCA_GRED_VQ_MAX, + tb_entry[TCA_GRED_VQ_ENTRY]); + + if (!tb[TCA_GRED_VQ_DP]) { + fprintf(stderr, + "ERROR: Virtual Queue without DP attribute\n"); + continue; + } + + dp = rta_getattr_u32(tb[TCA_GRED_VQ_DP]); + + if (tb[TCA_GRED_VQ_STAT_BYTES]) + info[dp].bytes = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_BYTES]); + if (tb[TCA_GRED_VQ_STAT_PACKETS]) + info[dp].packets = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PACKETS]); + if (tb[TCA_GRED_VQ_STAT_BACKLOG]) + info[dp].backlog = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_BACKLOG]); + if (tb[TCA_GRED_VQ_STAT_PROB_DROP]) + info[dp].prob_drop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PROB_DROP]); + if 
(tb[TCA_GRED_VQ_STAT_PROB_MARK]) + info[dp].prob_mark = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PROB_MARK]); + if (tb[TCA_GRED_VQ_STAT_FORCED_DROP]) + info[dp].forced_drop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_FORCED_DROP]); + if (tb[TCA_GRED_VQ_STAT_FORCED_MARK]) + info[dp].forced_mark = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_FORCED_MARK]); + if (tb[TCA_GRED_VQ_STAT_PDROP]) + info[dp].pdrop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PDROP]); + if (tb[TCA_GRED_VQ_STAT_OTHER]) + info[dp].other = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_OTHER]); + } +} + +static void +gred_print_stats(struct tc_gred_info *info, struct tc_gred_qopt *qopt) +{ + __u64 bytes = info ? info->bytes : qopt->bytesin; + SPRINT_BUF(b1); if (!is_json_context()) @@ -283,25 +365,44 @@ static void gred_print_stats(struct tc_gred_qopt *qopt) if (!is_json_context()) printf("\n Dropped packets: "); - print_uint(PRINT_ANY, "forced_drop", "forced %u ", qopt->forced); - print_uint(PRINT_ANY, "prob_drop", "early %u ", qopt->early); - print_uint(PRINT_ANY, "pdrop", "pdrop %u ", qopt->pdrop); - print_uint(PRINT_ANY, "other", "other %u ", qopt->other); + if (info) { + print_uint(PRINT_ANY, "forced_drop", "forced %u ", + info->forced_drop); + print_uint(PRINT_ANY, "prob_drop", "early %u ", + info->prob_drop); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", info->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", info->other); + + if (!is_json_context()) + printf("\n Marked packets: "); + print_uint(PRINT_ANY, "forced_mark", "forced %u ", + info->forced_mark); + print_uint(PRINT_ANY, "prob_mark", "early %u ", + info->prob_mark); + } else { + print_uint(PRINT_ANY, "forced_drop", "forced %u ", + qopt->forced); + print_uint(PRINT_ANY, "prob_drop", "early %u ", qopt->early); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", qopt->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", qopt->other); + } if (!is_json_context()) printf("\n Total packets: "); print_uint(PRINT_ANY, "packets", "%u ", qopt->packets); - 
print_uint(PRINT_JSON, "bytes", NULL, qopt->bytesin); - print_string(PRINT_FP, NULL, "(%s) ", sprint_size(qopt->bytesin, b1)); + print_uint(PRINT_JSON, "bytes", NULL, bytes); + print_string(PRINT_FP, NULL, "(%s) ", sprint_size(bytes, b1)); } static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) { + struct tc_gred_info infos[MAX_DPs] = {}; struct rtattr *tb[TCA_GRED_MAX + 1]; struct tc_gred_sopt *sopt; struct tc_gred_qopt *qopt; + bool vq_info = false; __u32 *max_p = NULL; __u32 *limit = NULL; unsigned int i; @@ -332,6 +433,11 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) return -1; } + if (tb[TCA_GRED_VQ_LIST] && show_stats) { + gred_parse_vqs(infos, tb[TCA_GRED_VQ_LIST]); + vq_info = true; + } + print_uint(PRINT_ANY, "dp_cnt", "vqs %u ", sopt->DPs); print_uint(PRINT_ANY, "dp_default", "default %u ", sopt->def_DP); @@ -381,7 +487,7 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) qopt->Scell_log); } if (show_stats) - gred_print_stats(qopt); + gred_print_stats(vq_info ? &infos[i] : NULL, qopt); close_json_object(); } close_json_array(PRINT_JSON, "vqs"); From 2d7c564a1e632771f72b3a9bf46bbf3e45558089 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:34 -0800 Subject: [PATCH 17/50] tc: gred: support controlling RED flags Kernel GRED qdisc supports ECN marking, and the harddrop flag but setting and dumping this flag is not possible with iproute2. Add the support. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- bash-completion/tc | 2 +- tc/q_gred.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bash-completion/tc b/bash-completion/tc index 29bca5d9..007e1c2e 100644 --- a/bash-completion/tc +++ b/bash-completion/tc @@ -302,7 +302,7 @@ _tc_qdisc_options() ;; gred) _tc_once_attr 'setup vqs default grio vq prio limit min max avpkt \ - burst probability bandwidth' + burst probability bandwidth ecn harddrop' return 0 ;; hhf) diff --git a/tc/q_gred.c b/tc/q_gred.c index fda41a57..dfa3252b 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -37,7 +37,7 @@ static void explain(void) { fprintf(stderr, "Usage: tc qdisc { add | replace | change } ... gred setup vqs NUMBER\n"); - fprintf(stderr, " default DEFAULT_VQ [ grio ] [ limit BYTES ]\n"); + fprintf(stderr, " default DEFAULT_VQ [ grio ] [ limit BYTES ] [ecn] [harddrop]\n"); fprintf(stderr, " tc qdisc change ... gred vq VQ [ prio VALUE ] limit BYTES\n"); fprintf(stderr, " min BYTES max BYTES avpkt BYTES [ burst PACKETS ]\n"); fprintf(stderr, " [ probability PROBABILITY ] [ bandwidth KBPS ]\n"); @@ -87,6 +87,10 @@ static int init_gred(struct qdisc_util *qu, int argc, char **argv, fprintf(stderr, "Illegal \"limit\"\n"); return -1; } + } else if (strcmp(*argv, "ecn") == 0) { + opt.flags |= TC_RED_ECN; + } else if (strcmp(*argv, "harddrop") == 0) { + opt.flags |= TC_RED_HARDDROP; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -452,6 +456,8 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) sprint_size(*limit, b1)); } + tc_red_print_flags(sopt->flags); + open_json_array(PRINT_JSON, "vqs"); for (i = 0; i < MAX_DPs; i++, qopt++) { if (qopt->DP >= MAX_DPs) From f7a8749affbe4a8f542e8f76080db1a64c64e594 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 19 Nov 2018 15:03:35 -0800 Subject: [PATCH 18/50] tc: gred: allow controlling and dumping per-DP RED flags Kernel now supports
setting ECN and HARDDROP flags per-virtual queue. Allow users to tweak the settings, and print them on dump. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David Ahern --- tc/q_gred.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tc/q_gred.c b/tc/q_gred.c index dfa3252b..e297b866 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -40,7 +40,7 @@ static void explain(void) fprintf(stderr, " default DEFAULT_VQ [ grio ] [ limit BYTES ] [ecn] [harddrop]\n"); fprintf(stderr, " tc qdisc change ... gred vq VQ [ prio VALUE ] limit BYTES\n"); fprintf(stderr, " min BYTES max BYTES avpkt BYTES [ burst PACKETS ]\n"); - fprintf(stderr, " [ probability PROBABILITY ] [ bandwidth KBPS ]\n"); + fprintf(stderr, " [ probability PROBABILITY ] [ bandwidth KBPS ] [ecn] [harddrop]\n"); } static int init_gred(struct qdisc_util *qu, int argc, char **argv, @@ -121,15 +121,16 @@ static int init_gred(struct qdisc_util *qu, int argc, char **argv, */ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n, const char *dev) { + struct rtattr *tail, *entry, *vqs; int ok = 0; struct tc_gred_qopt opt = { 0 }; unsigned int burst = 0; unsigned int avpkt = 0; + unsigned int flags = 0; double probability = 0.02; unsigned int rate = 0; int parm; __u8 sbuf[256]; - struct rtattr *tail; __u32 max_P; opt.DP = MAX_DPs; @@ -212,6 +213,10 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n return -1; } ok++; + } else if (strcmp(*argv, "ecn") == 0) { + flags |= TC_RED_ECN; + } else if (strcmp(*argv, "harddrop") == 0) { + flags |= TC_RED_HARDDROP; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -265,11 +270,20 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n addattr_l(n, 1024, TCA_GRED_STAB, sbuf, 256); max_P = probability * pow(2, 32); addattr32(n, 1024, TCA_GRED_MAX_P, max_P); + + vqs = addattr_nest(n, 1024, 
TCA_GRED_VQ_LIST); + entry = addattr_nest(n, 1024, TCA_GRED_VQ_ENTRY); + addattr32(n, 1024, TCA_GRED_VQ_DP, opt.DP); + addattr32(n, 1024, TCA_GRED_VQ_FLAGS, flags); + addattr_nest_end(n, entry); + addattr_nest_end(n, vqs); + addattr_nest_end(n, tail); return 0; } struct tc_gred_info { + bool flags_present; __u64 bytes; __u32 packets; __u32 backlog; @@ -279,6 +293,7 @@ struct tc_gred_info { __u32 forced_mark; __u32 pdrop; __u32 other; + __u32 flags; }; static void @@ -345,6 +360,10 @@ gred_parse_vqs(struct tc_gred_info *info, struct rtattr *vqs) if (tb[TCA_GRED_VQ_STAT_OTHER]) info[dp].other = rta_getattr_u32(tb[TCA_GRED_VQ_STAT_OTHER]); + info[dp].flags_present = !!tb[TCA_GRED_VQ_FLAGS]; + if (tb[TCA_GRED_VQ_FLAGS]) + info[dp].flags = + rta_getattr_u32(tb[TCA_GRED_VQ_FLAGS]); } } @@ -437,7 +456,7 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) return -1; } - if (tb[TCA_GRED_VQ_LIST] && show_stats) { + if (tb[TCA_GRED_VQ_LIST]) { gred_parse_vqs(infos, tb[TCA_GRED_VQ_LIST]); vq_info = true; } @@ -480,6 +499,9 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_string(PRINT_FP, NULL, "max %s ", sprint_size(qopt->qth_max, b1)); + if (infos[i].flags_present) + tc_red_print_flags(infos[i].flags); + if (show_details) { print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog); if (max_p) From ce5071eda617c60f075935d2acd0d9c4ace311c2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 15 Nov 2018 11:26:13 -0800 Subject: [PATCH 19/50] drop support for IPX IPX has been deprecated then removed from upstream kernels. Drop support from ip route as well.
Signed-off-by: Stephen Hemminger Signed-off-by: David Ahern --- Makefile | 3 -- include/utils.h | 10 ----- ip/ip.c | 4 +- ip/iproute.c | 2 +- lib/ipx_ntop.c | 71 ------------------------------- lib/ipx_pton.c | 97 ------------------------------------------ lib/utils.c | 2 - man/man8/ip-route.8.in | 2 +- man/man8/ip.8 | 9 +--- 9 files changed, 5 insertions(+), 195 deletions(-) delete mode 100644 lib/ipx_ntop.c delete mode 100644 lib/ipx_pton.c diff --git a/Makefile b/Makefile index b7488add..7d62468c 100644 --- a/Makefile +++ b/Makefile @@ -43,9 +43,6 @@ DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \ #options for decnet ADDLIB+=dnet_ntop.o dnet_pton.o -#options for ipx -ADDLIB+=ipx_ntop.o ipx_pton.o - #options for mpls ADDLIB+=mpls_ntop.o mpls_pton.o diff --git a/include/utils.h b/include/utils.h index 1630dd0b..fee7ff28 100644 --- a/include/utils.h +++ b/include/utils.h @@ -116,13 +116,6 @@ struct dn_naddr unsigned char a_addr[DN_MAXADDL]; }; -#define IPX_NODE_LEN 6 - -struct ipx_addr { - u_int32_t ipx_net; - u_int8_t ipx_node[IPX_NODE_LEN]; -}; - #ifndef AF_MPLS # define AF_MPLS 28 #endif @@ -204,9 +197,6 @@ int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta); const char *dnet_ntop(int af, const void *addr, char *str, size_t len); int dnet_pton(int af, const char *src, void *addr); -const char *ipx_ntop(int af, const void *addr, char *str, size_t len); -int ipx_pton(int af, const char *src, void *addr); - const char *mpls_ntop(int af, const void *addr, char *str, size_t len); int mpls_pton(int af, const char *src, void *addr, size_t alen); diff --git a/ip/ip.c b/ip/ip.c index c324120f..11dbed72 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -53,7 +53,7 @@ static void usage(void) " vrf | sr }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n" " -h[uman-readable] | -iec | -j[son] | -p[retty] |\n" -" -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n" +" -f[amily] { inet | inet6 | dnet | mpls | bridge | link } |\n" " -4 | -6 | 
-I | -D | -M | -B | -0 |\n" " -l[oops] { maximum-addr-flush-attempts } | -br[ief] |\n" " -o[neline] | -t[imestamp] | -ts[hort] | -b[atch] [filename] |\n" @@ -225,8 +225,6 @@ int main(int argc, char **argv) preferred_family = AF_INET6; } else if (strcmp(opt, "-0") == 0) { preferred_family = AF_PACKET; - } else if (strcmp(opt, "-I") == 0) { - preferred_family = AF_IPX; } else if (strcmp(opt, "-D") == 0) { preferred_family = AF_DECnet; } else if (strcmp(opt, "-M") == 0) { diff --git a/ip/iproute.c b/ip/iproute.c index b039f35b..26f7cd89 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -83,7 +83,7 @@ static void usage(void) "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n" "NH := [ encap ENCAPTYPE ENCAPHDR ] [ via [ FAMILY ] ADDRESS ]\n" " [ dev STRING ] [ weight NUMBER ] NHFLAGS\n" - "FAMILY := [ inet | inet6 | ipx | dnet | mpls | bridge | link ]\n" + "FAMILY := [ inet | inet6 | dnet | mpls | bridge | link ]\n" "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n" " [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n" " [ window NUMBER ] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n" diff --git a/lib/ipx_ntop.c b/lib/ipx_ntop.c deleted file mode 100644 index 80b8a34e..00000000 --- a/lib/ipx_ntop.c +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ int do_digit(char *str, u_int32_t addr, u_int32_t scale, size_t *pos, size_t len) -{ - u_int32_t tmp = addr >> (scale * 4); - - if (*pos == len) - return 1; - - tmp &= 0x0f; - if (tmp > 9) - *str = tmp + 'A' - 10; - else - *str = tmp + '0'; - (*pos)++; - - return 0; -} - -static const char *ipx_ntop1(const struct ipx_addr *addr, char *str, size_t len) -{ - int i; - size_t pos = 0; - - if (len == 0) - return str; - - for(i = 7; i >= 0; i--) - if (do_digit(str + pos, ntohl(addr->ipx_net), i, &pos, len)) - return str; - - if (pos == len) - return str; - - *(str + pos) = '.'; - pos++; - - for(i = 0; i < 6; i++) 
{ - if (do_digit(str + pos, addr->ipx_node[i], 1, &pos, len)) - return str; - if (do_digit(str + pos, addr->ipx_node[i], 0, &pos, len)) - return str; - } - - if (pos == len) - return str; - - *(str + pos) = 0; - - return str; -} - - -const char *ipx_ntop(int af, const void *addr, char *str, size_t len) -{ - switch(af) { - case AF_IPX: - errno = 0; - return ipx_ntop1((struct ipx_addr *)addr, str, len); - default: - errno = EAFNOSUPPORT; - } - - return NULL; -} diff --git a/lib/ipx_pton.c b/lib/ipx_pton.c deleted file mode 100644 index a97c1c1b..00000000 --- a/lib/ipx_pton.c +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include -#include - -#include "utils.h" - -static int ipx_getnet(u_int32_t *net, const char *str) -{ - int i; - u_int32_t tmp; - - for(i = 0; *str && (i < 8); i++) { - - if ((tmp = get_hex(*str)) == -1) { - if (*str == '.') - return 0; - else - return -1; - } - - str++; - (*net) <<= 4; - (*net) |= tmp; - } - - if (*str == 0) - return 0; - - return -1; -} - -static int ipx_getnode(u_int8_t *node, const char *str) -{ - int i; - u_int32_t tmp; - - for(i = 0; i < 6; i++) { - if ((tmp = get_hex(*str++)) == -1) - return -1; - node[i] = (u_int8_t)tmp; - node[i] <<= 4; - if ((tmp = get_hex(*str++)) == -1) - return -1; - node[i] |= (u_int8_t)tmp; - if (*str == ':') - str++; - } - - return 0; -} - -static int ipx_pton1(const char *src, struct ipx_addr *addr) -{ - char *sep = (char *)src; - int no_node = 0; - - memset(addr, 0, sizeof(struct ipx_addr)); - - while(*sep && (*sep != '.')) - sep++; - - if (*sep != '.') - no_node = 1; - - if (ipx_getnet(&addr->ipx_net, src)) - return 0; - - addr->ipx_net = htonl(addr->ipx_net); - - if (no_node) - return 1; - - if (ipx_getnode(addr->ipx_node, sep + 1)) - return 0; - - return 1; -} - -int ipx_pton(int af, const char *src, void *addr) -{ - int err; - - switch (af) { - case AF_IPX: - errno = 0; - err = ipx_pton1(src, (struct ipx_addr *)addr); - break; - default: - 
errno = EAFNOSUPPORT; - err = -1; - } - - return err; -} diff --git a/lib/utils.c b/lib/utils.c index 4965a575..22bc6c8d 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1000,8 +1000,6 @@ const char *rt_addr_n2a_r(int af, int len, return inet_ntop(af, addr, buf, buflen); case AF_MPLS: return mpls_ntop(af, addr, buf, buflen); - case AF_IPX: - return ipx_ntop(af, addr, buf, buflen); case AF_DECnet: { struct dn_naddr dna = { 2, { 0, 0, } }; diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 11dd4ca7..e11fba92 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -107,7 +107,7 @@ replace " } " .ti -8 .IR FAMILY " := [ " -.BR inet " | " inet6 " | " ipx " | " dnet " | " mpls " | " bridge " | " link " ]" +.BR inet " | " inet6 " | " dnet " | " mpls " | " bridge " | " link " ]" .ti -8 .IR OPTIONS " := " FLAGS " [ " diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1d358879..16867efb 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -34,7 +34,7 @@ ip \- show / manipulate routing, network devices, interfaces and tunnels \fB\-r\fR[\fIesolve\fR] | \fB\-iec\fR | \fB\-f\fR[\fIamily\fR] { -.BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " +.BR inet " | " inet6 " | " dnet " | " link " } | " \fB-4\fR | \fB-6\fR | \fB-I\fR | @@ -94,7 +94,7 @@ Zero (0) means loop until all addresses are removed. .TP .BR "\-f" , " \-family " Specifies the protocol family to use. The protocol family identifier can be one of -.BR "inet" , " inet6" , " bridge" , " ipx" , " dnet" , " mpls" +.BR "inet" , " inet6" , " bridge" , " dnet" , " mpls" or .BR link . If this option is not present, @@ -130,11 +130,6 @@ shortcut for shortcut for .BR "\-family decnet" . -.TP -.B \-I -shortcut for -.BR "\-family ipx" . 
- .TP .B \-M shortcut for From 55e106c48053307fd86b8a39c71b09282f54551d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Nov 2018 22:37:24 -0800 Subject: [PATCH 20/50] tc: fq: support ce_threshold attribute Kernel commit 48872c11b772 ("net_sched: sch_fq: add dctcp-like marking") added support for TCA_FQ_CE_THRESHOLD attribute. This patch adds iproute2 support for it. It also makes sure fq_print_xstats() can deal with smaller tc_fq_qd_stats structures given by older kernels. Usage : FQATTRS="ce_threshold 4ms" TXQS=8 for ETH in eth0 do tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq for i in `seq 1 $TXQS` do tc qd add dev $ETH parent 1:$i fq $FQATTRS done done Signed-off-by: Eric Dumazet Signed-off-by: David Ahern --- tc/q_fq.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/tc/q_fq.c b/tc/q_fq.c index f3dbf2ba..a4174380 100644 --- a/tc/q_fq.c +++ b/tc/q_fq.c @@ -56,6 +56,7 @@ static void explain(void) fprintf(stderr, " [ [no]pacing ] [ refill_delay TIME ]\n"); fprintf(stderr, " [ low_rate_threshold RATE ]\n"); fprintf(stderr, " [ orphan_mask MASK]\n"); + fprintf(stderr, " [ ce_threshold TIME ]\n"); } static unsigned int ilog2(unsigned int val) @@ -83,6 +84,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, unsigned int defrate; unsigned int refill_delay; unsigned int orphan_mask; + unsigned int ce_threshold; bool set_plimit = false; bool set_flow_plimit = false; bool set_quantum = false; @@ -92,6 +94,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, bool set_refill_delay = false; bool set_orphan_mask = false; bool set_low_rate_threshold = false; + bool set_ce_threshold = false; int pacing = -1; struct rtattr *tail; @@ -135,6 +138,13 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, return -1; } set_low_rate_threshold = true; + } else if (strcmp(*argv, "ce_threshold") == 0) { + NEXT_ARG(); + if 
(get_time(&ce_threshold, *argv)) { + fprintf(stderr, "Illegal \"ce_threshold\"\n"); + return -1; + } + set_ce_threshold = true; } else if (strcmp(*argv, "defrate") == 0) { NEXT_ARG(); if (strchr(*argv, '%')) { @@ -226,6 +236,9 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, if (set_orphan_mask) addattr_l(n, 1024, TCA_FQ_ORPHAN_MASK, &orphan_mask, sizeof(refill_delay)); + if (set_ce_threshold) + addattr_l(n, 1024, TCA_FQ_CE_THRESHOLD, + &ce_threshold, sizeof(ce_threshold)); addattr_nest_end(n, tail); return 0; } @@ -239,6 +252,7 @@ static int fq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) unsigned int rate, quantum; unsigned int refill_delay; unsigned int orphan_mask; + unsigned int ce_threshold; SPRINT_BUF(b1); @@ -310,21 +324,28 @@ static int fq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) fprintf(f, "refill_delay %s ", sprint_time(refill_delay, b1)); } + if (tb[TCA_FQ_CE_THRESHOLD] && + RTA_PAYLOAD(tb[TCA_FQ_CE_THRESHOLD]) >= sizeof(__u32)) { + ce_threshold = rta_getattr_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (ce_threshold != ~0U) + fprintf(f, "ce_threshold %s ", sprint_time(ce_threshold, b1)); + } + return 0; } static int fq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) { - struct tc_fq_qd_stats *st; + struct tc_fq_qd_stats *st, _st; if (xstats == NULL) return 0; - if (RTA_PAYLOAD(xstats) < sizeof(*st)) - return -1; + memset(&_st, 0, sizeof(_st)); + memcpy(&_st, RTA_DATA(xstats), min(RTA_PAYLOAD(xstats), sizeof(*st))); - st = RTA_DATA(xstats); + st = &_st; fprintf(f, " %u flows (%u inactive, %u throttled)", st->flows, st->inactive_flows, st->throttled_flows); @@ -343,6 +364,9 @@ static int fq_print_xstats(struct qdisc_util *qu, FILE *f, if (st->unthrottle_latency_ns) fprintf(f, ", %u ns latency", st->unthrottle_latency_ns); + if (st->ce_mark) + fprintf(f, ", %llu ce_mark", st->ce_mark); + if (st->flows_plimit) fprintf(f, ", %llu flows_plimit", st->flows_plimit); From 
6d03d6f7d92807b210acb1a9e006a6745a53db6c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Nov 2018 17:44:36 -0800 Subject: [PATCH 21/50] man: tc: update man page for fq packet scheduler Signed-off-by: Eric Dumazet Signed-off-by: David Ahern --- man/man8/tc-fq.8 | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/man/man8/tc-fq.8 b/man/man8/tc-fq.8 index f058a05a..1febe62b 100644 --- a/man/man8/tc-fq.8 +++ b/man/man8/tc-fq.8 @@ -15,23 +15,28 @@ BYTES ] [ .B maxrate RATE ] [ .B buckets -NUMBER ] [ +NUMBER ] [ +.B orphan_mask +NUMBER ] [ .B pacing | .B nopacing -] +] [ +.B ce_threshold +TIME ] .SH DESCRIPTION FQ (Fair Queue) is a classless packet scheduler meant to be mostly used for locally generated traffic. It is designed to achieve per flow pacing. FQ does flow separation, and is able to respect pacing requirements set by TCP stack. All packets belonging to a socket are considered as a 'flow'. -For non local packets (router workload), packet rxhash is used as fallback. +For non local packets (router workload), packet hash is used as fallback. An application can specify a maximum pacing rate using the .B SO_MAX_PACING_RATE setsockopt call. This packet scheduler adds delay between packets to -respect rate limitation set by TCP stack. +respect rate limitation set on each socket. Note that after linux-4.20, linux adopted EDT (Earliest Departure Time) +and TCP directly sets the appropriate Departure Time for each skb. Dequeueing happens in a round-robin fashion. A special FIFO queue is reserved for high priority packets ( @@ -72,18 +77,28 @@ is ignored only if it is larger than this value. The size of the hash table used for flow lookups. Each bucket is assigned a red-black tree for efficient collision sorting. Default: 1024. +.SS orphan_mask +For packets not owned by a socket, fq is able to mask a part of skb->hash +and reduce number of buckets associated with the traffic. 
This is a DDOS +prevention mechanism, and the default is 1023 (meaning no more than 1024 flows +are allocated for these packets) .SS [no]pacing Enable or disable flow pacing. Default is enabled. +.SS ce_threshold +sets a threshold above which all packets are marked with ECN Congestion +Experienced. This is useful for DCTCP-style congestion control algorithms that +require marking at very shallow queueing thresholds. + .SH EXAMPLES -#tc qdisc add dev eth0 root fq +#tc qdisc add dev eth0 root est 1sec 4sec fq ce_threshold 4ms .br -#tc -s -d qdisc +#tc -s -d qdisc sh dev eth0 .br -qdisc fq 8003: dev eth0 root refcnt 2 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 - Sent 503727981 bytes 1146972 pkt (dropped 0, overlimits 0 requeues 54452) - backlog 0b 0p requeues 54452 - 1289 flows (1289 inactive, 0 throttled) - 0 gc, 31 highprio, 27411 throttled +qdisc fq 800e: root refcnt 9 limit 10000p flow_limit 1000p buckets 1024 orphan_mask 1023 quantum 3028 initial_quantum 15140 low_rate_threshold 550Kbit refill_delay 40.0ms ce_threshold 4.0ms + Sent 533368436185 bytes 352296695 pkt (dropped 0, overlimits 0 requeues 1339864) + rate 39220Mbit 3238202pps backlog 12417828b 358p requeues 1339864 + 1052 flows (852 inactive, 0 throttled) + 112 gc, 0 highprio, 212 throttled, 21501 ns latency, 470241 ce_mark .br .SH SEE ALSO .BR tc (8), From dd7d522a67b2cd65fab8c7ecd9a695e099d9f66e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 3 Dec 2018 16:01:07 -0800 Subject: [PATCH 22/50] Revert "tc: flower: Classify packets based port ranges" This reverts commit e20e50b0c1643d371dc6597dfc38355d235c84a0. Inadvertently pushed v3 of this patch. 
Signed-off-by: David Ahern --- tc/f_flower.c | 143 ++++---------------------------------------------- 1 file changed, 10 insertions(+), 133 deletions(-) diff --git a/tc/f_flower.c b/tc/f_flower.c index 9bddf7be..65fca043 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -494,68 +494,6 @@ static int flower_parse_port(char *str, __u8 ip_proto, return 0; } -static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type, - __be16 *min_port_type, - __be16 *max_port_type) -{ - if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP || - ip_proto == IPPROTO_SCTP) { - if (type == FLOWER_ENDPOINT_SRC) { - *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN; - *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX; - } else { - *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN; - *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX; - } - } else { - return -1; - } - - return 0; -} - -static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto, - enum flower_endpoint endpoint, - struct nlmsghdr *n) -{ - __be16 min_port_type, max_port_type; - - if (flower_port_range_attr_type(ip_proto, endpoint, &min_port_type, - &max_port_type)) - return -1; - - addattr16(n, MAX_MSG, min_port_type, *min); - addattr16(n, MAX_MSG, max_port_type, *max); - - return 0; -} - -static int get_range(__be16 *min, __be16 *max, char *argv) -{ - char *r; - - r = strchr(argv, '-'); - if (r) { - *r = '\0'; - if (get_be16(min, argv, 10)) { - fprintf(stderr, "invalid min range\n"); - return -1; - } - if (get_be16(max, r + 1, 10)) { - fprintf(stderr, "invalid max range\n"); - return -1; - } - if (htons(*max) <= htons(*min)) { - fprintf(stderr, "max value should be greater than min value\n"); - return -1; - } - } else { - fprintf(stderr, "Illegal range format\n"); - return -1; - } - return 0; -} - #define TCP_FLAGS_MAX_MASK 0xfff static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type, @@ -1123,54 +1061,20 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, return -1; } } 
else if (matches(*argv, "dst_port") == 0) { - __be16 min, max; - NEXT_ARG(); - if (matches(*argv, "range") == 0) { - NEXT_ARG(); - ret = get_range(&min, &max, *argv); - if (ret < 0) - return -1; - ret = flower_parse_port_range(&min, &max, - ip_proto, - FLOWER_ENDPOINT_DST, - n); - if (ret < 0) { - fprintf(stderr, "Illegal \"dst_port range\"\n"); - return -1; - } - } else { - ret = flower_parse_port(*argv, ip_proto, - FLOWER_ENDPOINT_DST, n); - if (ret < 0) { - fprintf(stderr, "Illegal \"dst_port\"\n"); - return -1; - } + ret = flower_parse_port(*argv, ip_proto, + FLOWER_ENDPOINT_DST, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"dst_port\"\n"); + return -1; } } else if (matches(*argv, "src_port") == 0) { - __be16 min, max; - NEXT_ARG(); - if (matches(*argv, "range") == 0) { - NEXT_ARG(); - ret = get_range(&min, &max, *argv); - if (ret < 0) - return -1; - ret = flower_parse_port_range(&min, &max, - ip_proto, - FLOWER_ENDPOINT_SRC, - n); - if (ret < 0) { - fprintf(stderr, "Illegal \"src_port range\"\n"); - return -1; - } - } else { - ret = flower_parse_port(*argv, ip_proto, - FLOWER_ENDPOINT_SRC, n); - if (ret < 0) { - fprintf(stderr, "Illegal \"src_port\"\n"); - return -1; - } + ret = flower_parse_port(*argv, ip_proto, + FLOWER_ENDPOINT_SRC, n); + if (ret < 0) { + fprintf(stderr, "Illegal \"src_port\"\n"); + return -1; } } else if (matches(*argv, "tcp_flags") == 0) { NEXT_ARG(); @@ -1586,22 +1490,6 @@ static void flower_print_port(char *name, struct rtattr *attr) print_hu(PRINT_ANY, name, namefrm, rta_getattr_be16(attr)); } -static void flower_print_port_range(char *name, struct rtattr *min_attr, - struct rtattr *max_attr) -{ - SPRINT_BUF(namefrm); - SPRINT_BUF(out); - size_t done; - - if (!min_attr || !max_attr) - return; - - done = sprintf(out, "%u", rta_getattr_be16(min_attr)); - sprintf(out + done, "-%u", rta_getattr_be16(max_attr)); - sprintf(namefrm, "\n %s %%s", name); - print_string(PRINT_ANY, name, namefrm, out); -} - static void 
flower_print_tcp_flags(const char *name, struct rtattr *flags_attr, struct rtattr *mask_attr) { @@ -1790,7 +1678,6 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) { struct rtattr *tb[TCA_FLOWER_MAX + 1]; - __be16 min_port_type, max_port_type; int nl_type, nl_mask_type; __be16 eth_type = 0; __u8 ip_proto = 0xff; @@ -1909,16 +1796,6 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, if (nl_type >= 0) flower_print_port("src_port", tb[nl_type]); - if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_DST, - &min_port_type, &max_port_type)) - flower_print_port_range("dst_port range", - tb[min_port_type], tb[max_port_type]); - - if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_SRC, - &min_port_type, &max_port_type)) - flower_print_port_range("src_port range", - tb[min_port_type], tb[max_port_type]); - flower_print_tcp_flags("tcp_flags", tb[TCA_FLOWER_KEY_TCP_FLAGS], tb[TCA_FLOWER_KEY_TCP_FLAGS_MASK]); From 8930840e678b959226ee39276d6e54ab2fea8801 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 27 Nov 2018 14:40:03 -0800 Subject: [PATCH 23/50] tc: flower: Classify packets based port ranges Added support for filtering based on port ranges. UAPI changes have been accepted into net-next. Example: 1. Match on a port range: ------------------------- $ tc filter add dev enp4s0 protocol ip parent ffff:\ prio 1 flower ip_proto tcp dst_port 20-30 skip_hw\ action drop $ tc -s filter show dev enp4s0 parent ffff: filter protocol ip pref 1 flower chain 0 filter protocol ip pref 1 flower chain 0 handle 0x1 eth_type ipv4 ip_proto tcp dst_port 20-30 skip_hw not_in_hw action order 1: gact action drop random type none pass val 0 index 1 ref 1 bind 1 installed 85 sec used 3 sec Action statistics: Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0) backlog 0b 0p requeues 0 2. 
Match on IP address and port range: -------------------------------------- $ tc filter add dev enp4s0 protocol ip parent ffff:\ prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port 100-200\ skip_hw action drop $ tc -s filter show dev enp4s0 parent ffff: filter protocol ip pref 1 flower chain 0 handle 0x2 eth_type ipv4 ip_proto tcp dst_ip 192.168.1.1 dst_port 100-200 skip_hw not_in_hw action order 1: gact action drop random type none pass val 0 index 2 ref 1 bind 1 installed 58 sec used 2 sec Action statistics: Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0) backlog 0b 0p requeues 0 v6: Modified to change json output format as object for sport/dport. "dst_port":{ "start":2000, "end":6000 }, "src_port":{ "start":50, "end":60 } v5: Simplified some code and used 'sscanf' for parsing. Removed space in output format. v4: Added man updates explaining filtering based on port ranges. Removed 'range' keyword. v3: Modified flower_port_range_attr_type calls. v2: Addressed Jiri's comment to sync output format with input Signed-off-by: Amritha Nambiar Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 13 ++++--- tc/f_flower.c | 89 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index 8be88825..adff41e3 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -56,8 +56,9 @@ flower \- flow based traffic control filter .IR MASKED_IP_TTL " | { " .BR dst_ip " | " src_ip " } " .IR PREFIX " | { " -.BR dst_port " | " src_port " } " -.IR port_number " } | " +.BR dst_port " | " src_port " } { " +.IR port_number " | " +.IR min_port_number-max_port_number " } | " .B tcp_flags .IR MASKED_TCP_FLAGS " | " .B type @@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the \fBprotocol\fR option to tc filter, optionally followed by a slash and the prefix length. If the prefix is missing, \fBtc\fR assumes a full-length host match. 
.TP -.BI dst_port " NUMBER" +.IR \fBdst_port " { " NUMBER " | " " MIN_VALUE-MAX_VALUE " } .TQ -.BI src_port " NUMBER" -Match on layer 4 protocol source or destination port number. Only available for +.IR \fBsrc_port " { " NUMBER " | " " MIN_VALUE-MAX_VALUE " } +Match on layer 4 protocol source or destination port number. Alternatively, the +minimum and maximum values can be specified to match on a range of layer 4 +protocol source or destination port numbers. Only available for .BR ip_proto " values " udp ", " tcp " and " sctp which have to be specified in beforehand. .TP diff --git a/tc/f_flower.c b/tc/f_flower.c index 65fca043..c5636667 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -473,24 +473,57 @@ static int flower_port_attr_type(__u8 ip_proto, enum flower_endpoint endpoint) return -1; } +static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type, + __be16 *min_port_type, + __be16 *max_port_type) +{ + if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP || + ip_proto == IPPROTO_SCTP) { + if (type == FLOWER_ENDPOINT_SRC) { + *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX; + } else { + *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX; + } + } else { + return -1; + } + return 0; +} + static int flower_parse_port(char *str, __u8 ip_proto, enum flower_endpoint endpoint, struct nlmsghdr *n) { + __u16 min, max; int ret; - int type; - __be16 port; - type = flower_port_attr_type(ip_proto, endpoint); - if (type < 0) + ret = sscanf(str, "%hu-%hu", &min, &max); + + if (ret == 1) { + int type; + + type = flower_port_attr_type(ip_proto, endpoint); + if (type < 0) + return -1; + addattr16(n, MAX_MSG, type, htons(min)); + } else if (ret == 2) { + __be16 min_port_type, max_port_type; + + if (max <= min) { + fprintf(stderr, "max value should be greater than min value\n"); + return -1; + } + if (flower_port_range_attr_type(ip_proto, endpoint, + &min_port_type, 
&max_port_type)) + return -1; + + addattr16(n, MAX_MSG, min_port_type, htons(min)); + addattr16(n, MAX_MSG, max_port_type, htons(max)); + } else { return -1; - - ret = get_be16(&port, str, 10); - if (ret) - return -1; - - addattr16(n, MAX_MSG, type, port); - + } return 0; } @@ -1490,6 +1523,29 @@ static void flower_print_port(char *name, struct rtattr *attr) print_hu(PRINT_ANY, name, namefrm, rta_getattr_be16(attr)); } +static void flower_print_port_range(char *name, struct rtattr *min_attr, + struct rtattr *max_attr) +{ + if (!min_attr || !max_attr) + return; + + if (is_json_context()) { + open_json_object(name); + print_hu(PRINT_JSON, "start", NULL, rta_getattr_be16(min_attr)); + print_hu(PRINT_JSON, "end", NULL, rta_getattr_be16(max_attr)); + close_json_object(); + } else { + SPRINT_BUF(namefrm); + SPRINT_BUF(out); + size_t done; + + done = sprintf(out, "%u", rta_getattr_be16(min_attr)); + sprintf(out + done, "-%u", rta_getattr_be16(max_attr)); + sprintf(namefrm, "\n %s %%s", name); + print_string(PRINT_ANY, name, namefrm, out); + } +} + static void flower_print_tcp_flags(const char *name, struct rtattr *flags_attr, struct rtattr *mask_attr) { @@ -1678,6 +1734,7 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) { struct rtattr *tb[TCA_FLOWER_MAX + 1]; + __be16 min_port_type, max_port_type; int nl_type, nl_mask_type; __be16 eth_type = 0; __u8 ip_proto = 0xff; @@ -1796,6 +1853,16 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, if (nl_type >= 0) flower_print_port("src_port", tb[nl_type]); + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_DST, + &min_port_type, &max_port_type)) + flower_print_port_range("dst_port", + tb[min_port_type], tb[max_port_type]); + + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_SRC, + &min_port_type, &max_port_type)) + flower_print_port_range("src_port", + tb[min_port_type], tb[max_port_type]); + flower_print_tcp_flags("tcp_flags", 
tb[TCA_FLOWER_KEY_TCP_FLAGS], tb[TCA_FLOWER_KEY_TCP_FLAGS_MASK]); From 2557dca2b028a6c8f1f13b3fe28d8d3c14c2e254 Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Tue, 4 Dec 2018 10:14:07 +0000 Subject: [PATCH 24/50] devlink: Add string to uint{8,16,32} conversion for generic parameters Allow setting u{8,16,32} generic parameters as well-defined strings in devlink user space tool. Signed-off-by: Shalom Toledo Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 145 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 10 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 8bb254ea..1e3deb24 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1920,10 +1920,69 @@ static int cmd_dev_eswitch(struct dl *dl) return -ENOENT; } -static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl) +struct param_val_conv { + const char *name; + const char *vstr; + uint32_t vuint; +}; + +static bool param_val_conv_exists(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if (!strcmp(param_val_conv[i].name, name)) + return true; + + return false; +} + +static int +param_val_conv_uint_get(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name, const char *vstr, + uint32_t *vuint) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if (!strcmp(param_val_conv[i].name, name) && + !strcmp(param_val_conv[i].vstr, vstr)) { + *vuint = param_val_conv[i].vuint; + return 0; + } + + return -ENOENT; +} + +static int +param_val_conv_str_get(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name, uint32_t vuint, + const char **vstr) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if (!strcmp(param_val_conv[i].name, name) && + param_val_conv[i].vuint == vuint) { + *vstr = param_val_conv[i].vstr; + return 0; + } + + return -ENOENT; +} + +static const struct param_val_conv param_val_conv[] 
= {}; + +#define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv) + +static void pr_out_param_value(struct dl *dl, const char *nla_name, + int nla_type, struct nlattr *nl) { struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {}; struct nlattr *val_attr; + const char *vstr; + bool conv_exists; int err; err = mnl_attr_parse_nested(nl, attr_cb, nla_value); @@ -1939,15 +1998,51 @@ static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl) param_cmode_name(mnl_attr_get_u8(nla_value[DEVLINK_ATTR_PARAM_VALUE_CMODE]))); val_attr = nla_value[DEVLINK_ATTR_PARAM_VALUE_DATA]; + conv_exists = param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN, + nla_name); + switch (nla_type) { case MNL_TYPE_U8: - pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u8(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr)); + } break; case MNL_TYPE_U16: - pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u16(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr)); + } break; case MNL_TYPE_U32: - pr_out_uint(dl, "value", mnl_attr_get_u32(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u32(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u32(val_attr)); + } break; case MNL_TYPE_STRING: pr_out_str(dl, "value", mnl_attr_get_str(val_attr)); @@ -1962,6 +2057,7 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) { struct nlattr *nla_param[DEVLINK_ATTR_MAX + 1] = {}; struct nlattr *param_value_attr; + 
const char *nla_name; int nla_type; int err; @@ -1980,8 +2076,8 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) nla_type = mnl_attr_get_u8(nla_param[DEVLINK_ATTR_PARAM_TYPE]); - pr_out_str(dl, "name", - mnl_attr_get_str(nla_param[DEVLINK_ATTR_PARAM_NAME])); + nla_name = mnl_attr_get_str(nla_param[DEVLINK_ATTR_PARAM_NAME]); + pr_out_str(dl, "name", nla_name); if (!nla_param[DEVLINK_ATTR_PARAM_GENERIC]) pr_out_str(dl, "type", "driver-specific"); @@ -1992,7 +2088,7 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) mnl_attr_for_each_nested(param_value_attr, nla_param[DEVLINK_ATTR_PARAM_VALUES_LIST]) { pr_out_entry_start(dl); - pr_out_param_value(dl, nla_type, param_value_attr); + pr_out_param_value(dl, nla_name, nla_type, param_value_attr); pr_out_entry_end(dl); } pr_out_array_end(dl); @@ -2097,6 +2193,7 @@ static int cmd_dev_param_set(struct dl *dl) { struct param_ctx ctx = {}; struct nlmsghdr *nlh; + bool conv_exists; uint32_t val_u32; uint16_t val_u16; uint8_t val_u8; @@ -2124,10 +2221,22 @@ static int cmd_dev_param_set(struct dl *dl) NLM_F_REQUEST | NLM_F_ACK); dl_opts_put(nlh, dl); + conv_exists = param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN, + dl->opts.param_name); + mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_TYPE, ctx.nla_type); switch (ctx.nla_type) { case MNL_TYPE_U8: - err = strtouint8_t(dl->opts.param_value, &val_u8); + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u8 = val_u32; + } else { + err = strtouint8_t(dl->opts.param_value, &val_u8); + } if (err) goto err_param_value_parse; if (val_u8 == ctx.value.vu8) @@ -2135,7 +2244,16 @@ static int cmd_dev_param_set(struct dl *dl) mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u8); break; case MNL_TYPE_U16: - err = strtouint16_t(dl->opts.param_value, &val_u16); + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + 
PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u16 = val_u32; + } else { + err = strtouint16_t(dl->opts.param_value, &val_u16); + } if (err) goto err_param_value_parse; if (val_u16 == ctx.value.vu16) @@ -2143,7 +2261,14 @@ static int cmd_dev_param_set(struct dl *dl) mnl_attr_put_u16(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u16); break; case MNL_TYPE_U32: - err = strtouint32_t(dl->opts.param_value, &val_u32); + if (conv_exists) + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + else + err = strtouint32_t(dl->opts.param_value, &val_u32); if (err) goto err_param_value_parse; if (val_u32 == ctx.value.vu32) From a463fd4fa4c45eb47b0719217b4c8b90a544da72 Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Tue, 4 Dec 2018 10:14:08 +0000 Subject: [PATCH 25/50] devlink: Add support for 'fw_load_policy' generic parameter Add string to uint conversion for 'fw_load_policy' generic parameter. 
Signed-off-by: Shalom Toledo Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 13 ++++++++++++- include/uapi/linux/devlink.h | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 1e3deb24..3651e90c 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1972,7 +1972,18 @@ param_val_conv_str_get(const struct param_val_conv *param_val_conv, return -ENOENT; } -static const struct param_val_conv param_val_conv[] = {}; +static const struct param_val_conv param_val_conv[] = { + { + .name = "fw_load_policy", + .vstr = "driver", + .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, + }, + { + .name = "fw_load_policy", + .vstr = "flash", + .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, + }, +}; #define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5ee0e739..d0a33d79 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -163,6 +163,11 @@ enum devlink_param_cmode { DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1 }; +enum devlink_param_fw_load_policy_value { + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, From 738aebe52b2e53b45bb0e8b55fe341fdb25ff994 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 11 Dec 2018 08:40:09 -0800 Subject: [PATCH 26/50] drop support for DECnet DECnet belongs in the history museum of dead protocols along with Appletalk and IPX. Linux support has outlived its natural life and the time has come to remove it from iproute2. Dead code is a source of bugs and exploits. If anyone actually has DECnet running on some old distribution they can just keep to the old version of iproute2. 
Signed-off-by: Stephen Hemminger Signed-off-by: David Ahern --- Makefile | 3 -- README.decnet | 33 -------------- README.lnstat | 2 +- include/utils.h | 3 -- ip/ip.c | 2 +- ip/iproute.c | 2 +- lib/dnet_ntop.c | 101 ----------------------------------------- lib/dnet_pton.c | 75 ------------------------------ lib/utils.c | 23 ---------- man/man8/ip-route.8.in | 2 +- man/man8/ip.8 | 9 +--- 11 files changed, 6 insertions(+), 249 deletions(-) delete mode 100644 README.decnet delete mode 100644 lib/dnet_ntop.c delete mode 100644 lib/dnet_pton.c diff --git a/Makefile b/Makefile index 7d62468c..ad96fd54 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,6 @@ DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \ -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \ -DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" -#options for decnet -ADDLIB+=dnet_ntop.o dnet_pton.o - #options for mpls ADDLIB+=mpls_ntop.o mpls_pton.o diff --git a/README.decnet b/README.decnet deleted file mode 100644 index 4300f906..00000000 --- a/README.decnet +++ /dev/null @@ -1,33 +0,0 @@ - -Here are a few quick points about DECnet support... - - o iproute2 is the tool of choice for configuring the DECnet support for - Linux. For many features, it is the only tool which can be used to - configure them. - - o No name resolution is available as yet, all addresses must be - entered numerically. - - o Remember to set the hardware address of the interface using: - - ip link set ethX address xx:xx:xx:xx:xx:xx - (where xx:xx:xx:xx:xx:xx is the MAC address for your DECnet node - address) - - if your Ethernet card won't listen to more than one unicast - mac address at once. 
If the Linux DECnet stack doesn't talk to - any other DECnet nodes, then check this with tcpdump and if its - a problem, change the mac address (but do this _before_ starting - any other network protocol on the interface) - - o Whilst you can use ip addr add to add more than one DECnet address to an - interface, don't expect addresses which are not the same as the - kernels node address to work properly with 2.4 kernels. This should - be fine with 2.6 kernels as the routing code has been extensively - modified and improved. - - o The DECnet support is currently self contained. It does not depend on - the libdnet library. - -Steve Whitehouse - diff --git a/README.lnstat b/README.lnstat index 057925f6..59134a15 100644 --- a/README.lnstat +++ b/README.lnstat @@ -9,7 +9,7 @@ In addition to routing cache statistics, it supports any kind of statistics the linux kernel exports via a file in /proc/net/stat. In a stock 2.6.9 kernel, this is per-protocol neighbour cache statistics - (ipv4, ipv6, atm, decnet) + (ipv4, ipv6, atm) routing cache statistics (ipv4) connection tracking statistics diff --git a/include/utils.h b/include/utils.h index fee7ff28..92bbe82d 100644 --- a/include/utils.h +++ b/include/utils.h @@ -194,9 +194,6 @@ int matches(const char *arg, const char *pattern); int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits); int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta); -const char *dnet_ntop(int af, const void *addr, char *str, size_t len); -int dnet_pton(int af, const char *src, void *addr); - const char *mpls_ntop(int af, const void *addr, char *str, size_t len); int mpls_pton(int af, const char *src, void *addr, size_t alen); diff --git a/ip/ip.c b/ip/ip.c index 11dbed72..a5bbacb4 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -53,7 +53,7 @@ static void usage(void) " vrf | sr }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n" " -h[uman-readable] | -iec | -j[son] | -p[retty] |\n" -" -f[amily] { inet 
| inet6 | dnet | mpls | bridge | link } |\n" +" -f[amily] { inet | inet6 | mpls | bridge | link } |\n" " -4 | -6 | -I | -D | -M | -B | -0 |\n" " -l[oops] { maximum-addr-flush-attempts } | -br[ief] |\n" " -o[neline] | -t[imestamp] | -ts[hort] | -b[atch] [filename] |\n" diff --git a/ip/iproute.c b/ip/iproute.c index 26f7cd89..60e46e03 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -83,7 +83,7 @@ static void usage(void) "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n" "NH := [ encap ENCAPTYPE ENCAPHDR ] [ via [ FAMILY ] ADDRESS ]\n" " [ dev STRING ] [ weight NUMBER ] NHFLAGS\n" - "FAMILY := [ inet | inet6 | dnet | mpls | bridge | link ]\n" + "FAMILY := [ inet | inet6 | mpls | bridge | link ]\n" "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n" " [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n" " [ window NUMBER ] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n" diff --git a/lib/dnet_ntop.c b/lib/dnet_ntop.c deleted file mode 100644 index 17d960e3..00000000 --- a/lib/dnet_ntop.c +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ u_int16_t dn_ntohs(u_int16_t addr) -{ - union { - u_int8_t byte[2]; - u_int16_t word; - } u; - - u.word = addr; - return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); -} - -static __inline__ int do_digit(char *str, u_int16_t *addr, u_int16_t scale, size_t *pos, size_t len, int *started) -{ - u_int16_t tmp = *addr / scale; - - if (*pos == len) - return 1; - - if (((tmp) > 0) || *started || (scale == 1)) { - *str = tmp + '0'; - *started = 1; - (*pos)++; - *addr -= (tmp * scale); - } - - return 0; -} - - -static const char *dnet_ntop1(const struct dn_naddr *dna, char *str, size_t len) -{ - u_int16_t addr, area; - size_t pos = 0; - int started = 0; - - memcpy(&addr, dna->a_addr, sizeof(addr)); - addr = dn_ntohs(addr); - area = addr >> 10; - - if (dna->a_len != 2) - return NULL; - - addr &= 0x03ff; - - if (len == 
0) - return str; - - if (do_digit(str + pos, &area, 10, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &area, 1, &pos, len, &started)) - return str; - - if (pos == len) - return str; - - *(str + pos) = '.'; - pos++; - started = 0; - - if (do_digit(str + pos, &addr, 1000, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 100, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 10, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 1, &pos, len, &started)) - return str; - - if (pos == len) - return str; - - *(str + pos) = 0; - - return str; -} - - -const char *dnet_ntop(int af, const void *addr, char *str, size_t len) -{ - switch(af) { - case AF_DECnet: - errno = 0; - return dnet_ntop1((struct dn_naddr *)addr, str, len); - default: - errno = EAFNOSUPPORT; - } - - return NULL; -} diff --git a/lib/dnet_pton.c b/lib/dnet_pton.c deleted file mode 100644 index 1cf54e51..00000000 --- a/lib/dnet_pton.c +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ u_int16_t dn_htons(u_int16_t addr) -{ - union { - u_int8_t byte[2]; - u_int16_t word; - } u; - - u.word = addr; - return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); -} - - -static int dnet_num(const char *src, u_int16_t * dst) -{ - int rv = 0; - int tmp; - *dst = 0; - - while ((tmp = *src++) != 0) { - tmp -= '0'; - if ((tmp < 0) || (tmp > 9)) - return rv; - - rv++; - (*dst) *= 10; - (*dst) += tmp; - } - - return rv; -} - -static int dnet_pton1(const char *src, struct dn_naddr *dna) -{ - u_int16_t addr; - u_int16_t area = 0; - u_int16_t node = 0; - int pos; - - pos = dnet_num(src, &area); - if ((pos == 0) || (area > 63) || (*(src + pos) != '.')) - return 0; - pos = dnet_num(src + pos + 1, &node); - if ((pos == 0) || (node > 1023)) - return 0; - dna->a_len = 2; - addr = dn_htons((area << 10) | node); - memcpy(dna->a_addr, &addr, 
sizeof(addr)); - - return 1; -} - -int dnet_pton(int af, const char *src, void *addr) -{ - int err; - - switch (af) { - case AF_DECnet: - errno = 0; - err = dnet_pton1(src, (struct dn_naddr *)addr); - break; - default: - errno = EAFNOSUPPORT; - err = -1; - } - - return err; -} diff --git a/lib/utils.c b/lib/utils.c index 22bc6c8d..9ebc8274 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -600,18 +600,6 @@ static int __get_addr_1(inet_prefix *addr, const char *name, int family) return 0; } - if (family == AF_DECnet) { - struct dn_naddr dna; - - addr->family = AF_DECnet; - if (dnet_pton(AF_DECnet, name, &dna) <= 0) - return -1; - memcpy(addr->data, dna.a_addr, 2); - addr->bytelen = 2; - addr->bitlen = -1; - return 0; - } - if (family == AF_MPLS) { unsigned int maxlabels; int i; @@ -1000,13 +988,6 @@ const char *rt_addr_n2a_r(int af, int len, return inet_ntop(af, addr, buf, buflen); case AF_MPLS: return mpls_ntop(af, addr, buf, buflen); - case AF_DECnet: - { - struct dn_naddr dna = { 2, { 0, 0, } }; - - memcpy(dna.a_addr, addr, 2); - return dnet_ntop(af, &dna, buf, buflen); - } case AF_PACKET: return ll_addr_n2a(addr, len, ARPHRD_VOID, buf, buflen); case AF_BRIDGE: @@ -1048,8 +1029,6 @@ int read_family(const char *name) family = AF_INET; else if (strcmp(name, "inet6") == 0) family = AF_INET6; - else if (strcmp(name, "dnet") == 0) - family = AF_DECnet; else if (strcmp(name, "link") == 0) family = AF_PACKET; else if (strcmp(name, "ipx") == 0) @@ -1067,8 +1046,6 @@ const char *family_name(int family) return "inet"; if (family == AF_INET6) return "inet6"; - if (family == AF_DECnet) - return "dnet"; if (family == AF_PACKET) return "link"; if (family == AF_IPX) diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index a664d848..9603ac6e 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -107,7 +107,7 @@ replace " } " .ti -8 .IR FAMILY " := [ " -.BR inet " | " inet6 " | " dnet " | " mpls " | " bridge " | " link " ]" +.BR inet " | " inet6 " | " mpls " | 
" bridge " | " link " ]" .ti -8 .IR OPTIONS " := " FLAGS " [ " diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 16867efb..84ade110 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -34,7 +34,7 @@ ip \- show / manipulate routing, network devices, interfaces and tunnels \fB\-r\fR[\fIesolve\fR] | \fB\-iec\fR | \fB\-f\fR[\fIamily\fR] { -.BR inet " | " inet6 " | " dnet " | " link " } | " +.BR inet " | " inet6 " | " link " } | " \fB-4\fR | \fB-6\fR | \fB-I\fR | @@ -94,7 +94,7 @@ Zero (0) means loop until all addresses are removed. .TP .BR "\-f" , " \-family " Specifies the protocol family to use. The protocol family identifier can be one of -.BR "inet" , " inet6" , " bridge" , " dnet" , " mpls" +.BR "inet" , " inet6" , " bridge" , " mpls" or .BR link . If this option is not present, @@ -125,11 +125,6 @@ shortcut for shortcut for .BR "\-family bridge" . -.TP -.B \-D -shortcut for -.BR "\-family decnet" . - .TP .B \-M shortcut for From e5cd5a51f9757152235d4ad2cc72340aed5a1ea8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 18 Dec 2018 07:57:20 -0800 Subject: [PATCH 27/50] doc: remove trailing whitespace Run whitespace scrubbing script to remove unnecessary trailing blanks at end of line and end of file. Signed-off-by: Stephen Hemminger Signed-off-by: David Ahern --- README.iproute2+tc | 2 +- README.lnstat | 15 +++--- doc/actions/actions-general | 99 ++++++++++++++++++------------------- doc/actions/gact-usage | 17 +++---- doc/actions/ifb-README | 62 +++++++++++------------ doc/actions/mirred-usage | 26 +++++----- netem/README.distribution | 2 +- 7 files changed, 110 insertions(+), 113 deletions(-) diff --git a/README.iproute2+tc b/README.iproute2+tc index 5979098e..e7bb48ce 100644 --- a/README.iproute2+tc +++ b/README.iproute2+tc @@ -42,7 +42,7 @@ in rsvp/cbqinit.eth1. Terminology and advices about setting CBQ parameters may be found in Sally Floyd -papers. +papers. Pairs X:Y are class handles, X:0 are qdisc handles. 
diff --git a/README.lnstat b/README.lnstat index 59134a15..eab4088f 100644 --- a/README.lnstat +++ b/README.lnstat @@ -7,8 +7,8 @@ This tool is a generalized and more feature-complete replacement for the old In addition to routing cache statistics, it supports any kind of statistics the linux kernel exports via a file in /proc/net/stat. In a stock 2.6.9 -kernel, this is - per-protocol neighbour cache statistics +kernel, this is + per-protocol neighbour cache statistics (ipv4, ipv6, atm) routing cache statistics (ipv4) @@ -29,7 +29,7 @@ In order to get a list of supported statistics files, you can run lnstat -d It will display something like - + /proc/net/stat/arp_cache: 1: entries 2: allocs @@ -52,19 +52,19 @@ arp_cach|rt_cache|arp_cach| You can specify the interval (e.g. 10 seconds) by: - + lnstat -i 10 You can specify to only use one particular statistics file: lnstat -f ip_conntrack -You can specify individual field widths +You can specify individual field widths lnstat -k arp_cache:entries,rt_cache:entries -w 20,8 You can specify not to print a header at all - + lnstat -s 0 You can specify to print a header only at start of the program @@ -76,6 +76,5 @@ You can specify to print a header at start and every 20 lines: lnstat -s 20 You can specify the number of samples you want to take (e.g. 5): - - lnstat -c 5 + lnstat -c 5 diff --git a/doc/actions/actions-general b/doc/actions/actions-general index 08cc785c..407a514c 100644 --- a/doc/actions/actions-general +++ b/doc/actions/actions-general @@ -6,8 +6,8 @@ What is it? ----------- An extension to the filtering/classification architecture of Linux Traffic -Control. -Up to 2.6.8 the only action that could be "attached" to a filter was policing. +Control. +Up to 2.6.8 the only action that could be "attached" to a filter was policing. 
i.e you could say something like: ----- @@ -17,7 +17,7 @@ tc filter add dev lo parent ffff: protocol ip prio 10 u32 match ip src \ which implies "if a packet is seen on the ingress of the lo device with a source IP address of 127.0.0.1/32 we give it a classification id of 1:1 and -we execute a policing action which rate limits its bandwidth utilization +we execute a policing action which rate limits its bandwidth utilization to 1.5Mbps". The new extensions allow for more than just policing actions to be added. @@ -29,9 +29,9 @@ syntax which will work fine. Of course to get the required effect you need both newer tc and kernel. If you are reading this you have the right tc ;-> -A side effect is that we can now get stateless firewalling to work with tc. +A side effect is that we can now get stateless firewalling to work with tc. Essentially this is now an alternative to iptables. -I won't go into details of my dislike for iptables at times, but +I won't go into details of my dislike for iptables at times, but scalability is one of the main issues; however, if you need stateful classification - use netfilter (for now). @@ -61,7 +61,7 @@ tc filter add dev lo parent 1:0 protocol ip prio 10 u32 \ match ip src 127.0.0.1/32 flowid 1:1 \ action police mtu 4000 rate 1500kbit burst 90k -" generic Actions" (gact) at the moment are: +" generic Actions" (gact) at the moment are: { drop, pass, reclassify, continue} (If you have others, no listed here give me a reason and we will add them) +drop says to drop the packet @@ -93,43 +93,43 @@ decimal 12, then use flowid 1:c. 3) A feature i call pipe The motivation is derived from Unix pipe mechanism but applied to packets. -Essentially take a matching packet and pass it through +Essentially take a matching packet and pass it through action1 | action2 | action3 etc. 
You could do something similar to this with the tc policer and the "continue" -operator but this rather restricts it to just the policer and requires -multiple rules (and lookups, hence quiet inefficient); +operator but this rather restricts it to just the policer and requires +multiple rules (and lookups, hence quiet inefficient); -as an example -- and please note that this is just an example _not_ The +as an example -- and please note that this is just an example _not_ The Word Youve Been Waiting For (yes i have had problems giving examples which ended becoming dogma in documents and people modifying them a little -to look clever); +to look clever); -i selected the metering rates to be small so that i can show better how +i selected the metering rates to be small so that i can show better how things work. - -The script below does the following: -- an incoming packet from 10.0.0.21 is first given a firewall mark of 1. -- It is then metered to make sure it does not exceed its allocated rate of +The script below does the following: +- an incoming packet from 10.0.0.21 is first given a firewall mark of 1. + +- It is then metered to make sure it does not exceed its allocated rate of 1Kbps. If it doesn't exceed rate, this is where we terminate action execution. -- If it does exceed its rate, its "color" changes to a mark of 2 and it is +- If it does exceed its rate, its "color" changes to a mark of 2 and it is then passed through a second meter. 
--The second meter is shared across all flows on that device [i am surpised -that this seems to be not a well know feature of the policer; Bert was telling +-The second meter is shared across all flows on that device [i am surpised +that this seems to be not a well know feature of the policer; Bert was telling me that someone was writing a qdisc just to do sharing across multiple devices; it must be the summer heat again; weve had someone doing that every year around -summer -- the key to sharing is to use a operator "index" in your policer -rules (example "index 20"). All your rules have to use the same index to +summer -- the key to sharing is to use a operator "index" in your policer +rules (example "index 20"). All your rules have to use the same index to share.] - + -If the second meter is exceeded the color of the flow changes further to 3. -We then pass the packet to another meter which is shared across all devices in the system. If this meter is exceeded we drop the packet. -Note the mark can be used further up the system to do things like policy +Note the mark can be used further up the system to do things like policy or more interesting things on the egress. ------------------ cut here ------------------------------- @@ -161,31 +161,31 @@ action ipt -j mark --set-mark 3 \ # and then attempt to borrow from a meter used by all devices in the # system. Should this be exceeded, drop the packet on the floor. 
action police index 20 mtu 5000 rate 1kbit burst 90k drop ---------------------------------- +--------------------------------- -Now lets see the actions installed with +Now lets see the actions installed with "tc filter show parent ffff: dev eth0" -------- output ----------- jroot# tc filter show parent ffff: dev eth0 -filter protocol ip pref 1 u32 -filter protocol ip pref 1 u32 fh 800: ht divisor 1 -filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15 +filter protocol ip pref 1 u32 +filter protocol ip pref 1 u32 fh 800: ht divisor 1 +filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x1 index 2 - action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb + action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb - action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x2 index 1 - action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b + action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b - action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x3 index 3 - action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b + action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b match 0a000015/ffffffff at 12 ------------------------------- @@ -209,31 +209,31 @@ Now lets take a look at the stats with "tc -s filter show parent ffff: dev eth0" -------------- jroot# tc -s filter show parent ffff: dev eth0 -filter protocol ip pref 1 u32 -filter protocol ip pref 1 u32 fh 800: ht divisor 1 +filter protocol ip pref 1 u32 +filter protocol ip pref 1 u32 fh 800: ht divisor 1 filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 -5 
+5 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x1 index 2 - Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0) + Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0) - action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb - Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122) + action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb + Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122) - action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x2 index 1 - Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0) + Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0) - action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b - Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945) + action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b + Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945) - action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x3 index 3 - Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0) + Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0) - action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b - Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437) + action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b + Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437) match 0a000015/ffffffff at 12 ------------------------------- @@ -254,4 +254,3 @@ At the moment the focus has been on getting the architecture in place. Expect new things in the spurious time i have to work on this (particularly around end of year when i have typically get time off from work). 
- diff --git a/doc/actions/gact-usage b/doc/actions/gact-usage index de1308dd..5fc3e628 100644 --- a/doc/actions/gact-usage +++ b/doc/actions/gact-usage @@ -1,13 +1,13 @@ gact [RAND] [INDEX] -Where: - ACTION := reclassify | drop | continue | pass | ok +Where: + ACTION := reclassify | drop | continue | pass | ok RAND := random RANDTYPE := netrand | determ VAL : = value not exceeding 10000 INDEX := index value used - + ACTION semantics - pass and ok are equivalent to accept - continue allows to restart classification lookup @@ -42,14 +42,14 @@ filter u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 32 suc random type none pass val 0 index 1 ref 1 bind 1 installed 59 sec used 35 sec Sent 1680 bytes 20 pkts (dropped 20, overlimits 0 ) - + ---- # example 2 #allow 1 out 10 randomly using the netrand generator tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ 10.0.0.9/32 flowid 1:16 action drop random netrand ok 10 - + ping -c 20 10.0.0.9 ---- @@ -59,14 +59,14 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1 random type netrand pass val 10 index 5 ref 1 bind 1 installed 49 sec used 25 sec Sent 1680 bytes 20 pkts (dropped 16, overlimits 0 ) - + -------- #alternative: deterministically accept every second packet tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ 10.0.0.9/32 flowid 1:16 action drop random determ ok 2 - + ping -c 20 10.0.0.9 - + tc -s filter show parent ffff: dev eth0 ----- filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1filter protocol ip pref 6 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 20 success 20) @@ -76,4 +76,3 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1 index 4 ref 1 bind 1 installed 118 sec used 82 sec Sent 1680 bytes 20 pkts (dropped 10, overlimits 0 ) ----- - diff --git a/doc/actions/ifb-README b/doc/actions/ifb-README index 63247f3c..5fe91714 100644 --- 
a/doc/actions/ifb-README +++ b/doc/actions/ifb-README @@ -6,18 +6,18 @@ with a _lot_ less code. Known IMQ/IFB USES ------------------ -As far as i know the reasons listed below is why people use IMQ. +As far as i know the reasons listed below is why people use IMQ. It would be nice to know of anything else that i missed. 1) qdiscs/policies that are per device as opposed to system wide. IFB allows for sharing. 2) Allows for queueing incoming traffic for shaping instead of -dropping. I am not aware of any study that shows policing is +dropping. I am not aware of any study that shows policing is worse than shaping in achieving the end goal of rate control. I would be interested if anyone is experimenting. -3) Very interesting use: if you are serving p2p you may want to give +3) Very interesting use: if you are serving p2p you may want to give preference to your own locally originated traffic (when responses come back) vs someone using your system to do bittorent. So QoSing based on state comes in as the solution. What people did to achieve this was stick @@ -25,17 +25,17 @@ the IMQ somewhere prelocal hook. I think this is a pretty neat feature to have in Linux in general. (i.e not just for IMQ). But i won't go back to putting netfilter hooks in the device to satisfy -this. I also don't think its worth it hacking ifb some more to be +this. I also don't think its worth it hacking ifb some more to be aware of say L3 info and play ip rule tricks to achieve this. --> Instead the plan is to have a conntrack related action. This action will -selectively either query/create conntrack state on incoming packets. -Packets could then be redirected to ifb based on what happens -> eg -on incoming packets; if we find they are of known state we could send to +selectively either query/create conntrack state on incoming packets. 
+Packets could then be redirected to ifb based on what happens -> eg +on incoming packets; if we find they are of known state we could send to a different queue than one which didn't have existing state. This all however is dependent on whatever rules the admin enters. At the moment this 3rd function does not exist yet. I have decided that -instead of sitting on the patch for another year, to release it and then +instead of sitting on the patch for another year, to release it and then if there is pressure i will add this feature. An example, to provide functionality that most people use IMQ for below: @@ -43,10 +43,10 @@ An example, to provide functionality that most people use IMQ for below: -------- export TC="/sbin/tc" -$TC qdisc add dev ifb0 root handle 1: prio +$TC qdisc add dev ifb0 root handle 1: prio $TC qdisc add dev ifb0 parent 1:1 handle 10: sfq $TC qdisc add dev ifb0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000 -$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq +$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq $TC filter add dev ifb0 protocol ip pref 1 parent 1: handle 1 fw classid 1:1 $TC filter add dev ifb0 protocol ip pref 2 parent 1: handle 2 fw classid 1:2 @@ -54,7 +54,7 @@ ifconfig ifb0 up $TC qdisc add dev eth0 ingress -# redirect all IP packets arriving in eth0 to ifb0 +# redirect all IP packets arriving in eth0 to ifb0 # use mark 1 --> puts them onto class 1:1 $TC filter add dev eth0 parent ffff: protocol ip prio 10 u32 \ match u32 0 0 flowid 1:1 \ @@ -77,44 +77,44 @@ PING 10.22 (10.0.0.22): 56 data bytes --- 10.22 ping statistics --- 3 packets transmitted, 3 packets received, 0% packet loss round-trip min/avg/max = 0.6/1.3/2.8 ms -[root@jzny action-tests]# +[root@jzny action-tests]# ----- Now look at some stats: --- [root@jmandrake]:~# $TC -s filter show parent ffff: dev eth0 -filter protocol ip pref 10 u32 -filter protocol ip pref 10 u32 fh 800: ht divisor 1 -filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 
0 flowid 1:1 +filter protocol ip pref 10 u32 +filter protocol ip pref 10 u32 fh 800: ht divisor 1 +filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 match 00000000/00000000 at 0 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING - target MARK set 0x1 - index 1 ref 1 bind 1 installed 4195sec used 27sec - Sent 252 bytes 3 pkts (dropped 0, overlimits 0) + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + target MARK set 0x1 + index 1 ref 1 bind 1 installed 4195sec used 27sec + Sent 252 bytes 3 pkts (dropped 0, overlimits 0) action order 2: mirred (Egress Redirect to device ifb0) stolen index 1 ref 1 bind 1 installed 165 sec used 27 sec - Sent 252 bytes 3 pkts (dropped 0, overlimits 0) + Sent 252 bytes 3 pkts (dropped 0, overlimits 0) [root@jmandrake]:~# $TC -s qdisc -qdisc sfq 30: dev ifb0 limit 128p quantum 1514b - Sent 0 bytes 0 pkts (dropped 0, overlimits 0) -qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s - Sent 210 bytes 3 pkts (dropped 0, overlimits 0) -qdisc sfq 10: dev ifb0 limit 128p quantum 1514b - Sent 294 bytes 3 pkts (dropped 0, overlimits 0) +qdisc sfq 30: dev ifb0 limit 128p quantum 1514b + Sent 0 bytes 0 pkts (dropped 0, overlimits 0) +qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s + Sent 210 bytes 3 pkts (dropped 0, overlimits 0) +qdisc sfq 10: dev ifb0 limit 128p quantum 1514b + Sent 294 bytes 3 pkts (dropped 0, overlimits 0) qdisc prio 1: dev ifb0 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 - Sent 504 bytes 6 pkts (dropped 0, overlimits 0) -qdisc ingress ffff: dev eth0 ---------------- - Sent 308 bytes 5 pkts (dropped 0, overlimits 0) + Sent 504 bytes 6 pkts (dropped 0, overlimits 0) +qdisc ingress ffff: dev eth0 ---------------- + Sent 308 bytes 5 pkts (dropped 0, overlimits 0) [root@jmandrake]:~# ifconfig ifb0 -ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00 +ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00 inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link UP BROADCAST 
RUNNING NOARP MTU:1500 Metric:1 RX packets:6 errors:0 dropped:3 overruns:0 frame:0 TX packets:3 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:32 + collisions:0 txqueuelen:32 RX bytes:504 (504.0 b) TX bytes:252 (252.0 b) ----- diff --git a/doc/actions/mirred-usage b/doc/actions/mirred-usage index e749eedc..482ff66d 100644 --- a/doc/actions/mirred-usage +++ b/doc/actions/mirred-usage @@ -7,10 +7,10 @@ flow to be mirrored. High end switches typically can select based on more than just a port (eg a 5 tuple classifier). They may also be capable of redirecting. -Usage: +Usage: -mirred [index INDEX] -where: +mirred [index INDEX] +where: DIRECTION := ACTION := INDEX is the specific policy instance id @@ -18,7 +18,7 @@ DEVICENAME is the devicename Direction: - Ingress is not supported at the moment. It will be in the -future as well as mirror/redirecting to a socket. +future as well as mirror/redirecting to a socket. Action: - Mirror takes a copy of the packet and sends it to specified @@ -29,14 +29,14 @@ steals the packet and redirects to specified destination dev. What NOT to do if you don't want your machine to crash: ------------------------------------------------------ -Do not create loops! +Do not create loops! Loops are not hard to create in the egress qdiscs. Here are simple rules to follow if you don't want to get hurt: A) Do not have the same packet go to same netdevice twice in a single graph of policies. Your machine will just hang! -This is design intent _not a bug_ to teach you some lessons. +This is design intent _not a bug_ to teach you some lessons. In the future if there are easy ways to do this in the kernel without affecting other packets not interested in this feature @@ -51,7 +51,7 @@ B) Do not redirect from one IFB device to another. Remember that IFB is a very specialized case of packet redirecting device. Instead of redirecting it puts packets at the exact spot on the stack it found them from. 
-Redirecting from ifbX->ifbY will actually not crash your machine but your +Redirecting from ifbX->ifbY will actually not crash your machine but your packets will all be dropped (this is much simpler to detect and resolve and is only affecting users of ifb as opposed to the whole stack). @@ -64,7 +64,7 @@ Some examples: 1) Mirror all packets arriving on eth0 to be sent out on eth1. You may have a sniffer or some accounting box hooked up on eth1. - + --- tc qdisc add dev eth0 ingress tc filter add dev eth0 parent ffff: protocol ip prio 10 u32 \ @@ -100,7 +100,7 @@ stack (i.e ping would work). 3) Even more funky example: # -#allow 1 out 10 packets on ingress of lo to randomly make it to the +#allow 1 out 10 packets on ingress of lo to randomly make it to the # host A (Randomness uses the netrand generator) # --- @@ -111,9 +111,9 @@ action mirred egress mirror dev eth0 --- 4) -# for packets from 10.0.0.9 going out on eth0 (could be local -# IP or something # we are forwarding) - -# if exceeding a 100Kbps rate, then redirect to eth1 +# for packets from 10.0.0.9 going out on eth0 (could be local +# IP or something # we are forwarding) - +# if exceeding a 100Kbps rate, then redirect to eth1 # --- @@ -158,7 +158,7 @@ Essentially a good debugging/logging interface (sort of like BSDs speacialized log device does without needing one). If you replace mirror with redirect, those packets will be -blackholed and will never make it out. +blackholed and will never make it out. cheers, jamal diff --git a/netem/README.distribution b/netem/README.distribution index 23f7ecb7..6d527854 100644 --- a/netem/README.distribution +++ b/netem/README.distribution @@ -1,4 +1,4 @@ -Notes about distribution tables from Nistnet +Notes about distribution tables from Nistnet ------------------------------------------------------------------------------- I. 
About the distribution tables From db9af1f1e3bc2a64413d3c3cf0f6cc8fde7470ad Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 18 Dec 2018 10:11:10 -0800 Subject: [PATCH 28/50] testsuite: drop unrunnable test The classifier testbed test never worked and was always being skipped. It depended on some files in tests/cls which never made it into the iproute2 git repository. Signed-off-by: Stephen Hemminger Signed-off-by: David Ahern --- testsuite/tests/tc/cls-testbed.t | 73 -------------------------------- 1 file changed, 73 deletions(-) delete mode 100755 testsuite/tests/tc/cls-testbed.t diff --git a/testsuite/tests/tc/cls-testbed.t b/testsuite/tests/tc/cls-testbed.t deleted file mode 100755 index d5c21e5c..00000000 --- a/testsuite/tests/tc/cls-testbed.t +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# vim: ft=sh - -. lib/generic.sh - -QDISCS="cbq htb dsmark" - -if [ ! -d tests/cls ]; then - ts_log "tests/cls folder does not exist" - ts_skip -fi - -for q in ${QDISCS}; do - ts_log "Preparing classifier testbed with qdisc $q" - - for c in tests/cls/*.c; do - - case "$q" in - cbq) - ts_tc "cls-testbed" "cbq root qdisc creation" \ - qdisc add dev $DEV root handle 10:0 \ - cbq bandwidth 100Mbit avpkt 1400 mpu 64 - ts_tc "cls-testbed" "cbq root class creation" \ - class add dev $DEV parent 10:0 classid 10:12 \ - cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 \ - maxburst 1 avpkt 500 bounded - ;; - htb) - ts_qdisc_available "htb" - if [ $? -eq 0 ]; then - ts_log "cls-testbed: HTB is unsupported by $TC, skipping" - continue; - fi - ts_tc "cls-testbed" "htb root qdisc creation" \ - qdisc add dev $DEV root handle 10:0 htb - ts_tc "cls-testbed" "htb root class creation" \ - class add dev $DEV parent 10:0 classid 10:12 \ - htb rate 100Mbit quantum 1514 - ;; - dsmark) - ts_qdisc_available "dsmark" - if [ $? 
-eq 0 ]; then - ts_log "cls-testbed: dsmark is unsupported by $TC, skipping" - continue; - fi - ts_tc "cls-testbed" "dsmark root qdisc creation" \ - qdisc add dev $DEV root handle 20:0 \ - dsmark indices 64 default_index 1 set_tc_index - ts_tc "cls-testbed" "dsmark class creation" \ - class change dev $DEV parent 20:0 classid 20:12 \ - dsmark mask 0xff value 2 - ts_tc "cls-testbed" "prio inner qdisc creation" \ - qdisc add dev $DEV parent 20:0 handle 10:0 prio - ;; - *) - ts_err "cls-testbed: no testbed configuration found for qdisc $q" - continue - ;; - esac - - ts_tc "cls-testbed" "tree listing" qdisc list dev eth0 - ts_tc "cls-testbed" "tree class listing" class list dev eth0 - ts_log "cls-testbed: starting classifier test $c" - $c - - case "$q" in - *) - ts_tc "cls-testbed" "generic qdisc tree deletion" \ - qdisc del dev $DEV root - ;; - esac - done -done From 17689d3075c8b9a29b8f398a57defb9dcabafe81 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 12:47:29 -0800 Subject: [PATCH 29/50] Update kernel headers Update kernel headers to commit 055722716c39 ("tipc: fix uninitialized value for broadcast retransmission") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 177 +++++++++++++++++++++-------- include/uapi/linux/btf.h | 18 ++- include/uapi/linux/if_bridge.h | 21 ++++ include/uapi/linux/if_link.h | 1 + include/uapi/linux/if_tun.h | 1 + include/uapi/linux/neighbour.h | 1 + include/uapi/linux/net_namespace.h | 2 + include/uapi/linux/snmp.h | 1 + 8 files changed, 172 insertions(+), 50 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ff651ca6..960a7f0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -232,6 +232,20 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the + * verifier will allow any alignment whatsoever. 
On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 @@ -257,9 +271,6 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) -/* flags for BPF_PROG_QUERY */ -#define BPF_F_QUERY_EFFECTIVE (1U << 0) - #define BPF_OBJ_NAME_LEN 16U /* Flags for accessing BPF object */ @@ -269,6 +280,12 @@ enum bpf_attach_type { /* Flag for stack_map, store build_id+offset instead of pointer */ #define BPF_F_STACK_BUILD_ID (1U << 5) +/* Zero-initialize hash function seed. This should only be used for testing. */ +#define BPF_F_ZERO_SEED (1U << 6) + +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -335,6 +352,13 @@ union bpf_attr { * (context accesses, allowed helpers, etc). 
*/ __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -353,8 +377,11 @@ union bpf_attr { struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ __u32 prog_fd; __u32 retval; - __u32 data_size_in; - __u32 data_size_out; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ __aligned_u64 data_in; __aligned_u64 data_out; __u32 repeat; @@ -475,18 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) - * Description - * Pop an element from *map*. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) - * Description - * Get an element from *map* without removing it. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1910,9 +1925,9 @@ union bpf_attr { * is set to metric from route (IPv4/IPv6 only), and ifindex * is set to the device index of the nexthop from the FIB lookup. * - * *plen* argument is the size of the passed in struct. - * *flags* argument can be a combination of one or more of the - * following values: + * *plen* argument is the size of the passed in struct. 
+ * *flags* argument can be a combination of one or more of the + * following values: * * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB @@ -1921,9 +1936,9 @@ union bpf_attr { * Perform lookup from an egress perspective (default is * ingress). * - * *ctx* is either **struct xdp_md** for XDP programs or - * **struct sk_buff** tc cls_act programs. - * Return + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return * * < 0 if any input argument is invalid * * 0 on success (packet is forwarded, nexthop neighbor exists) * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the @@ -2068,8 +2083,8 @@ union bpf_attr { * translated to a keycode using the rc keymap, and reported as * an input key down event. After a period a key up event is * generated. This period can be extended by calling either - * **bpf_rc_keydown** () again with the same values, or calling - * **bpf_rc_repeat** (). + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. @@ -2152,21 +2167,22 @@ union bpf_attr { * The *flags* meaning is specific for each map type, * and has to be 0 for cgroup local storage. * - * Depending on the bpf program type, a local storage area - * can be shared between multiple instances of the bpf program, + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the BPF_STX_XADD instruction to alter + * For example, by using the **BPF_STX_XADD** instruction to alter * the shared data. * Return - * Pointer to the local storage area. + * A pointer to the local storage area. 
* * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description - * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map - * It checks the selected sk is matching the incoming - * request in the skb. + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. * Return * 0 on success, or a negative error in case of failure. * @@ -2174,7 +2190,7 @@ union bpf_attr { * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, - * and if non-NULL, released via **bpf_sk_release**\ (). + * and if non-**NULL**, released via **bpf_sk_release**\ (). * * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). This is used @@ -2202,15 +2218,15 @@ union bpf_attr { * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return - * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, the *struct bpf_sock* - * result is from reuse->socks[] using the hash of the tuple. + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, - * and if non-NULL, released via **bpf_sk_release**\ (). + * and if non-**NULL**, released via **bpf_sk_release**\ (). * * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). 
This is used @@ -2238,33 +2254,71 @@ union bpf_attr { * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return - * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, the *struct bpf_sock* - * result is from reuse->socks[] using the hash of the tuple. + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. * - * int bpf_sk_release(struct bpf_sock *sk) + * int bpf_sk_release(struct bpf_sock *sock) * Description - * Release the reference held by *sock*. *sock* must be a non-NULL - * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) * Description - * For socket policies, insert *len* bytes into msg at offset + * For socket policies, insert *len* bytes into *msg* at offset * *start*. * * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a - * *msg* it may want to insert metadata or options into the msg. + * *msg* it may want to insert metadata or options into the *msg*. * This can later be read and used by any of the lower layer BPF * hooks. 
* * This helper may fail if under memory pressure (a malloc * fails) in these cases BPF programs will get an appropriate * error and BPF programs will need to handle them. - * * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * Description + * Will remove *pop* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being a valid part of *msg* + * payload and/or *pop* value being too large. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**".
+ * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2357,7 +2411,9 @@ union bpf_attr { FN(map_push_elem), \ FN(map_pop_elem), \ FN(map_peek_elem), \ - FN(msg_push_data), + FN(msg_push_data), \ + FN(msg_pop_data), \ + FN(rc_pointer_rel), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2474,6 +2530,8 @@ struct __sk_buff { __u32 data_meta; __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; }; struct bpf_tunnel_key { @@ -2649,6 +2707,16 @@ struct bpf_prog_info { __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2960,4 +3028,19 @@ struct bpf_flow_keys { }; }; +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 8d2a8ffa..f43d5a8e 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -40,7 +40,8 @@ struct btf_type { /* "size" is used by INT, ENUM, STRUCT and UNION. * "size" tells the size of the type it is describing. * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. * "type" is a type_id referring to another type. 
*/ union { @@ -64,8 +65,10 @@ struct btf_type { #define BTF_KIND_VOLATILE 9 /* Volatile */ #define BTF_KIND_CONST 10 /* Const */ #define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_MAX 11 -#define NR_BTF_KINDS 12 +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -110,4 +113,13 @@ struct btf_member { __u32 offset; /* offset in bits */ }; +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + #endif /* __LINUX_BTF_H__ */ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bdfecf94..04f763cf 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -292,4 +292,25 @@ struct br_mcast_stats { __u64 mcast_bytes[BR_MCAST_DIR_SIZE]; __u64 mcast_packets[BR_MCAST_DIR_SIZE]; }; + +/* bridge boolean options + * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets + * + * IMPORTANT: if adding a new option do not forget to handle + * it in br_boolopt_toggle/get and bridge sysfs + */ +enum br_boolopt_id { + BR_BOOLOPT_NO_LL_LEARN, + BR_BOOLOPT_MAX +}; + +/* struct br_boolopt_multi - change multiple bridge boolean options + * + * @optval: new option values (bit per option) + * @optmask: options to change (bit per option) + */ +struct br_boolopt_multi { + __u32 optval; + __u32 optmask; +}; #endif /* _LINUX_IF_BRIDGE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e1ef848a..484ddf83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -286,6 +286,7 @@ enum { IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, __IFLA_BR_MAX, }; diff 
--git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index be9b744a..2f011655 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -59,6 +59,7 @@ #define TUNGETVNETBE _IOR('T', 223, int) #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) +#define TUNSETCARRIER _IOW('T', 226, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 99815544..cd144e30 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -28,6 +28,7 @@ enum { NDA_MASTER, NDA_LINK_NETNSID, NDA_SRC_VNI, + NDA_PROTOCOL, /* Originator of entry */ __NDA_MAX }; diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h index 6d64d071..fa81f1e5 100644 --- a/include/uapi/linux/net_namespace.h +++ b/include/uapi/linux/net_namespace.h @@ -16,6 +16,8 @@ enum { NETNSA_NSID, NETNSA_PID, NETNSA_FD, + NETNSA_TARGET_NSID, + NETNSA_CURRENT_NSID, __NETNSA_MAX, }; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f80135e5..86dc24a9 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -243,6 +243,7 @@ enum LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */ LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */ + LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */ LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */ LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ From fdce94d0d174c818b45688ef783c6fc5b69affc8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 22 Dec 2018 07:36:52 -0800 Subject: [PATCH 30/50] Update kernel headers Update kernel headers to commit ce28bb445388 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 13 ++++++++- include/uapi/linux/btf.h | 20 ++++++++++++-- include/uapi/linux/if_tunnel.h | 20 ++++++++++++++ 
include/uapi/linux/in.h | 10 ++++--- include/uapi/linux/netfilter.h | 4 --- include/uapi/linux/netfilter/ipset/ip_set.h | 19 +++++++++----- include/uapi/linux/netfilter_ipv4.h | 28 -------------------- include/uapi/linux/netfilter_ipv6.h | 29 --------------------- include/uapi/linux/netlink.h | 2 +- 9 files changed, 70 insertions(+), 75 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 960a7f0a..fb541e16 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ @@ -2657,6 +2665,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { @@ -2717,6 +2726,8 @@ struct bpf_prog_info { __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/include/uapi/linux/btf.h 
b/include/uapi/linux/btf.h index f43d5a8e..cb4cf8cc 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,7 +34,9 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -52,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -110,9 +113,22 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; }; +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). 
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index ecdc7666..c7f0a5e6 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -160,4 +160,24 @@ enum { }; #define IFLA_VTI_MAX (__IFLA_VTI_MAX - 1) + +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) +#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) +#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) + +#define TUNNEL_OPTIONS_PRESENT \ + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) + #endif /* _IF_TUNNEL_H_ */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index a4f143b3..92242d20 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -266,10 +266,14 @@ struct sockaddr_in { #define IN_CLASSD(a) ((((long int) (a)) & 0xf0000000) == 0xe0000000) #define IN_MULTICAST(a) IN_CLASSD(a) -#define IN_MULTICAST_NET 0xF0000000 +#define IN_MULTICAST_NET 0xe0000000 -#define IN_EXPERIMENTAL(a) ((((long int) (a)) & 0xf0000000) == 0xf0000000) -#define IN_BADCLASS(a) IN_EXPERIMENTAL((a)) +#define IN_BADCLASS(a) (((long int) (a) ) == (long int)0xffffffff) +#define IN_EXPERIMENTAL(a) IN_BADCLASS((a)) + +#define IN_CLASSE(a) ((((long int) (a)) & 0xf0000000) == 0xf0000000) +#define IN_CLASSE_NET 0xffffffff +#define IN_CLASSE_NSHIFT 0 /* Address to accept any incoming messages.
*/ #define INADDR_ANY ((unsigned long int) 0x00000000) diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 36378a0a..899be986 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -33,10 +33,6 @@ #define NF_DROP_ERR(x) (((-x) << 16) | NF_DROP) /* only for userspace compatibility */ -/* Generic cache responses from hook functions. - <= 0x2000 is used for protocol-flags. */ -#define NFC_UNKNOWN 0x4000 -#define NFC_ALTERED 0x8000 /* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */ #define NF_VERDICT_BITS 16 diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 13eeada5..153c517a 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -13,8 +13,9 @@ #include -/* The protocol version */ -#define IPSET_PROTOCOL 6 +/* The protocol versions */ +#define IPSET_PROTOCOL 7 +#define IPSET_PROTOCOL_MIN 6 /* The max length of strings including NUL: set and type identifiers */ #define IPSET_MAXNAMELEN 32 @@ -38,17 +39,19 @@ enum ipset_cmd { IPSET_CMD_TEST, /* 11: Test an element in a set */ IPSET_CMD_HEADER, /* 12: Get set header data only */ IPSET_CMD_TYPE, /* 13: Get set type */ + IPSET_CMD_GET_BYNAME, /* 14: Get set index by name */ + IPSET_CMD_GET_BYINDEX, /* 15: Get set name by index */ IPSET_MSG_MAX, /* Netlink message commands */ /* Commands in userspace: */ - IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 14: Enter restore mode */ - IPSET_CMD_HELP, /* 15: Get help */ - IPSET_CMD_VERSION, /* 16: Get program version */ - IPSET_CMD_QUIT, /* 17: Quit from interactive mode */ + IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 16: Enter restore mode */ + IPSET_CMD_HELP, /* 17: Get help */ + IPSET_CMD_VERSION, /* 18: Get program version */ + IPSET_CMD_QUIT, /* 19: Quit from interactive mode */ IPSET_CMD_MAX, - IPSET_CMD_COMMIT = IPSET_CMD_MAX, /* 18: Commit buffered commands */ + IPSET_CMD_COMMIT = 
IPSET_CMD_MAX, /* 20: Commit buffered commands */ }; /* Attributes at command level */ @@ -66,6 +69,7 @@ enum { IPSET_ATTR_LINENO, /* 9: Restore lineno */ IPSET_ATTR_PROTOCOL_MIN, /* 10: Minimal supported version number */ IPSET_ATTR_REVISION_MIN = IPSET_ATTR_PROTOCOL_MIN, /* type rev min */ + IPSET_ATTR_INDEX, /* 11: Kernel index of set */ __IPSET_ATTR_CMD_MAX, }; #define IPSET_ATTR_CMD_MAX (__IPSET_ATTR_CMD_MAX - 1) @@ -223,6 +227,7 @@ enum ipset_adt { /* Sets are identified by an index in kernel space. Tweak with ip_set_id_t * and IPSET_INVALID_ID if you want to increase the max number of sets. + * Also, IPSET_ATTR_INDEX must be changed. */ typedef __u16 ip_set_id_t; diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index 074e2c8b..96979e37 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -12,34 +12,6 @@ #include /* for INT_MIN, INT_MAX */ -/* IP Cache bits. */ -/* Src IP address. */ -#define NFC_IP_SRC 0x0001 -/* Dest IP address. */ -#define NFC_IP_DST 0x0002 -/* Input device. */ -#define NFC_IP_IF_IN 0x0004 -/* Output device. */ -#define NFC_IP_IF_OUT 0x0008 -/* TOS. */ -#define NFC_IP_TOS 0x0010 -/* Protocol. */ -#define NFC_IP_PROTO 0x0020 -/* IP options. */ -#define NFC_IP_OPTIONS 0x0040 -/* Frag & flags. */ -#define NFC_IP_FRAG 0x0080 - -/* Per-protocol information: only matters if proto match. */ -/* TCP flags. */ -#define NFC_IP_TCPFLAGS 0x0100 -/* Source port. */ -#define NFC_IP_SRC_PT 0x0200 -/* Dest port. */ -#define NFC_IP_DST_PT 0x0400 -/* Something else about the proto */ -#define NFC_IP_PROTO_UNKNOWN 0x2000 - /* IP Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP_PRE_ROUTING 0 diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 92701fe8..eedf7a2d 100644 --- a/include/uapi/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h @@ -15,35 +15,6 @@ #include /* for INT_MIN, INT_MAX */ -/* IP Cache bits. 
*/ -/* Src IP address. */ -#define NFC_IP6_SRC 0x0001 -/* Dest IP address. */ -#define NFC_IP6_DST 0x0002 -/* Input device. */ -#define NFC_IP6_IF_IN 0x0004 -/* Output device. */ -#define NFC_IP6_IF_OUT 0x0008 -/* TOS. */ -#define NFC_IP6_TOS 0x0010 -/* Protocol. */ -#define NFC_IP6_PROTO 0x0020 -/* IP options. */ -#define NFC_IP6_OPTIONS 0x0040 -/* Frag & flags. */ -#define NFC_IP6_FRAG 0x0080 - - -/* Per-protocol information: only matters if proto match. */ -/* TCP flags. */ -#define NFC_IP6_TCPFLAGS 0x0100 -/* Source port. */ -#define NFC_IP6_SRC_PT 0x0200 -/* Dest port. */ -#define NFC_IP6_DST_PT 0x0400 -/* Something else about the proto */ -#define NFC_IP6_PROTO_UNKNOWN 0x2000 - /* IP6 Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP6_PRE_ROUTING 0 diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 2966171b..2c28d329 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -153,7 +153,7 @@ enum nlmsgerr_attrs { #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 #define NETLINK_EXT_ACK 11 -#define NETLINK_DUMP_STRICT_CHK 12 +#define NETLINK_GET_STRICT_CHK 12 struct nl_pktinfo { __u32 group; From 2750252d7e850793a7d50987abb974141eb8525e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 14:07:15 -0700 Subject: [PATCH 31/50] libnetlink: dump extack string in done message Print any extack message that has been appended to a NLMSG_DONE message. To avoid duplication, move the existing print code to a new helper. 
Signed-off-by: David Ahern --- lib/libnetlink.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 95457109..e28e1ab6 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -67,6 +67,14 @@ static int err_attr_cb(const struct nlattr *attr, void *data) return MNL_CB_OK; } +static void print_ext_ack_msg(bool is_err, const char *msg) +{ + fprintf(stderr, "%s: %s", is_err ? "Error" : "Warning", msg); + if (msg[strlen(msg) - 1] != '.') + fprintf(stderr, "."); + fprintf(stderr, "\n"); +} + /* dump netlink extended ack error message */ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) { @@ -108,12 +116,29 @@ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) if (msg && *msg != '\0') { bool is_err = !!err->error; - fprintf(stderr, "%s: %s", - is_err ? "Error" : "Warning", msg); - if (msg[strlen(msg) - 1] != '.') - fprintf(stderr, "."); - fprintf(stderr, "\n"); + print_ext_ack_msg(is_err, msg); + return is_err ? 1 : 0; + } + return 0; +} + +static int nl_dump_ext_ack_done(const struct nlmsghdr *nlh, int error) +{ + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {}; + unsigned int hlen = sizeof(int); + const char *msg = NULL; + + if (mnl_attr_parse(nlh, hlen, err_attr_cb, tb) != MNL_CB_OK) + return 0; + + if (tb[NLMSGERR_ATTR_MSG]) + msg = mnl_attr_get_str(tb[NLMSGERR_ATTR_MSG]); + + if (msg && *msg != '\0') { + bool is_err = !!error; + + print_ext_ack_msg(is_err, msg); return is_err ? 
1 : 0; } @@ -127,6 +152,11 @@ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) { return 0; } + +static int nl_dump_ext_ack_done(const struct nlmsghdr *nlh, int error) +{ + return 0; +} #endif void rtnl_close(struct rtnl_handle *rth) @@ -512,6 +542,10 @@ static int rtnl_dump_done(struct nlmsghdr *h) } if (len < 0) { + /* check for any messages returned from kernel */ + if (nl_dump_ext_ack_done(h, len)) + return len; + errno = -len; switch (errno) { case ENOENT: From 92e03242c4c0a5c6ac3571153dc7323b0b45a430 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 19 Oct 2018 15:34:44 -0700 Subject: [PATCH 32/50] libnetlink: Use NLMSG_LENGTH to set nlmsg_len Change nlmsg_len from sizeof(req) to use NLMSG_LENGTH on the header. 2 of the inner headers are not 4-byte aligned, so add a 0-length buf after the header with the __aligned(NLMSG_ALIGNTO) to ensure the size of the request is large enough. Use NLMSG_ALIGN in NLMSG_LENGTH to set nlmsg_len. Signed-off-by: David Ahern --- lib/libnetlink.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index e28e1ab6..8ab2355f 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -28,6 +28,8 @@ #include "libnetlink.h" +#define __aligned(x) __attribute__((aligned(x))) + #ifndef SOL_NETLINK #define SOL_NETLINK 270 #endif @@ -238,7 +240,7 @@ int rtnl_addrdump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ifaddrmsg ifm; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), .nlh.nlmsg_type = RTM_GETADDR, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -254,7 +256,7 @@ int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ifaddrlblmsg ifal; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrlblmsg)), .nlh.nlmsg_type = RTM_GETADDRLABEL, 
.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -270,7 +272,7 @@ int rtnl_routedump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct rtmsg rtm; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), .nlh.nlmsg_type = RTM_GETROUTE, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -286,7 +288,7 @@ int rtnl_ruledump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct fib_rule_hdr frh; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), .nlh.nlmsg_type = RTM_GETRULE, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -302,7 +304,7 @@ int rtnl_neighdump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ndmsg ndm; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), .nlh.nlmsg_type = RTM_GETNEIGH, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -318,7 +320,7 @@ int rtnl_neightbldump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ndtmsg ndtmsg; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndtmsg)), .nlh.nlmsg_type = RTM_GETNEIGHTBL, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -334,7 +336,7 @@ int rtnl_mdbdump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct br_port_msg bpm; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg)), .nlh.nlmsg_type = RTM_GETMDB, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -349,8 +351,9 @@ int rtnl_netconfdump_req(struct rtnl_handle *rth, int family) struct { struct nlmsghdr nlh; struct netconfmsg ncm; + char buf[0] __aligned(NLMSG_ALIGNTO); } req = { - 
.nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct netconfmsg))), .nlh.nlmsg_type = RTM_GETNETCONF, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -365,8 +368,9 @@ int rtnl_nsiddump_req(struct rtnl_handle *rth, int family) struct { struct nlmsghdr nlh; struct rtgenmsg rtm; + char buf[0] __aligned(NLMSG_ALIGNTO); } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct rtgenmsg))), .nlh.nlmsg_type = RTM_GETNSID, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -388,7 +392,7 @@ int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, struct nlmsghdr nlh; struct ifinfomsg ifm; /* attribute has to be NLMSG aligned */ - struct rtattr ext_req __attribute__ ((aligned(NLMSG_ALIGNTO))); + struct rtattr ext_req __aligned(NLMSG_ALIGNTO); __u32 ext_filter_mask; } req = { .nlh.nlmsg_len = sizeof(req), From d97b16b2c90614b784f045f070f4326f5a3b5eaa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Oct 2018 14:12:39 -0700 Subject: [PATCH 33/50] libnetlink: linkdump_req: Only AF_UNSPEC family expects an ext_filter_mask Only AF_UNSPEC handled by rtnl_dump_ifinfo expects an ext_filter_mask on a dump request. Update the linkdump request functions to only set and send ext_filter_mask for AF_UNSPEC. 
Signed-off-by: David Ahern --- lib/libnetlink.c | 103 ++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 8ab2355f..b9c37fd3 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -380,41 +380,11 @@ int rtnl_nsiddump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_linkdump_req(struct rtnl_handle *rth, int family) -{ - return rtnl_linkdump_req_filter(rth, family, RTEXT_FILTER_VF); -} - -int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, - __u32 filt_mask) +static int __rtnl_linkdump_req(struct rtnl_handle *rth, int family) { struct { struct nlmsghdr nlh; struct ifinfomsg ifm; - /* attribute has to be NLMSG aligned */ - struct rtattr ext_req __aligned(NLMSG_ALIGNTO); - __u32 ext_filter_mask; - } req = { - .nlh.nlmsg_len = sizeof(req), - .nlh.nlmsg_type = RTM_GETLINK, - .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, - .nlh.nlmsg_seq = rth->dump = ++rth->seq, - .ifm.ifi_family = family, - .ext_req.rta_type = IFLA_EXT_MASK, - .ext_req.rta_len = RTA_LENGTH(sizeof(__u32)), - .ext_filter_mask = filt_mask, - }; - - return send(rth->fd, &req, sizeof(req), 0); -} - -int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, - req_filter_fn_t filter_fn) -{ - struct { - struct nlmsghdr nlh; - struct ifinfomsg ifm; - char buf[1024]; } req = { .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nlh.nlmsg_type = RTM_GETLINK, @@ -422,16 +392,73 @@ int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, .nlh.nlmsg_seq = rth->dump = ++rth->seq, .ifm.ifi_family = family, }; - int err; - if (!filter_fn) - return -EINVAL; + return send(rth->fd, &req, sizeof(req), 0); +} - err = filter_fn(&req.nlh, sizeof(req)); - if (err) - return err; +int rtnl_linkdump_req(struct rtnl_handle *rth, int family) +{ + if (family == AF_UNSPEC) + return rtnl_linkdump_req_filter(rth, family, RTEXT_FILTER_VF); - 
return send(rth->fd, &req, req.nlh.nlmsg_len, 0); + return __rtnl_linkdump_req(rth, family); +} + +int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, + __u32 filt_mask) +{ + if (family == AF_UNSPEC) { + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + /* attribute has to be NLMSG aligned */ + struct rtattr ext_req __aligned(NLMSG_ALIGNTO); + __u32 ext_filter_mask; + } req = { + .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .nlh.nlmsg_seq = rth->dump = ++rth->seq, + .ifm.ifi_family = family, + .ext_req.rta_type = IFLA_EXT_MASK, + .ext_req.rta_len = RTA_LENGTH(sizeof(__u32)), + .ext_filter_mask = filt_mask, + }; + + return send(rth->fd, &req, sizeof(req), 0); + } + + return __rtnl_linkdump_req(rth, family); +} + +int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) +{ + if (family == AF_UNSPEC) { + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + char buf[1024]; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .nlh.nlmsg_seq = rth->dump = ++rth->seq, + .ifm.ifi_family = family, + }; + int err; + + if (!filter_fn) + return -EINVAL; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + + return send(rth->fd, &req, req.nlh.nlmsg_len, 0); + } + + return __rtnl_linkdump_req(rth, family); } int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) From 43fd93ae46ad223dd558e00169a05c9b5b617b64 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Oct 2018 19:08:15 -0700 Subject: [PATCH 34/50] ip route: Remove rtnl_rtcache_request Add a filter option to rtnl_routedump_req and use it to set rtm_flags removing the need for rtnl_rtcache_request for dump requests. 
Signed-off-by: David Ahern --- include/libnetlink.h | 7 ++++--- ip/ipmroute.c | 2 +- ip/iproute.c | 43 ++++++++++++++----------------------------- lib/libnetlink.c | 12 +++++++++++- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index 138840d5..b0051f39 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -47,11 +47,14 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions, void rtnl_close(struct rtnl_handle *rth); +typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); + int rtnl_addrdump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); -int rtnl_routedump_req(struct rtnl_handle *rth, int family) +int rtnl_routedump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_ruledump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); @@ -71,8 +74,6 @@ int rtnl_linkdump_req(struct rtnl_handle *rth, int fam) int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) __attribute__((warn_unused_result)); -typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); - int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int fam, req_filter_fn_t fn) __attribute__((warn_unused_result)); diff --git a/ip/ipmroute.c b/ip/ipmroute.c index 4d8867d3..de7a035f 100644 --- a/ip/ipmroute.c +++ b/ip/ipmroute.c @@ -283,7 +283,7 @@ static int mroute_list(int argc, char **argv) filter.iif = idx; } - if (rtnl_routedump_req(&rth, filter.af) < 0) { + if (rtnl_routedump_req(&rth, filter.af, NULL) < 0) { perror("Cannot send dump request"); return 1; } diff --git a/ip/iproute.c b/ip/iproute.c index d3472469..3c0be0a9 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1535,24 +1535,6 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, 
char **argv) return 0; } -static int rtnl_rtcache_request(struct rtnl_handle *rth, int family) -{ - struct { - struct nlmsghdr nlh; - struct rtmsg rtm; - } req = { - .nlh.nlmsg_len = sizeof(req), - .nlh.nlmsg_type = RTM_GETROUTE, - .nlh.nlmsg_flags = NLM_F_ROOT | NLM_F_REQUEST, - .nlh.nlmsg_seq = rth->dump = ++rth->seq, - .rtm.rtm_family = family, - .rtm.rtm_flags = RTM_F_CLONED, - }; - struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; - - return sendto(rth->fd, (void *)&req, sizeof(req), 0, (struct sockaddr *)&nladdr, sizeof(nladdr)); -} - static int iproute_flush_cache(void) { #define ROUTE_FLUSH_PATH "/proc/sys/net/ipv4/route/flush" @@ -1644,7 +1626,7 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) filter.flushe = sizeof(flushb); for (;;) { - if (rtnl_routedump_req(&rth, do_ipv6) < 0) { + if (rtnl_routedump_req(&rth, do_ipv6, NULL) < 0) { perror("Cannot send dump request"); return -2; } @@ -1684,6 +1666,16 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) } } +static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct rtmsg *rtm = NLMSG_DATA(nlh); + + if (filter.cloned) + rtm->rtm_flags |= RTM_F_CLONED; + + return 0; +} + static int iproute_list_flush_or_save(int argc, char **argv, int action) { int do_ipv6 = preferred_family; @@ -1889,16 +1881,9 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) if (action == IPROUTE_FLUSH) return iproute_flush(do_ipv6, filter_fn); - if (!filter.cloned) { - if (rtnl_routedump_req(&rth, do_ipv6) < 0) { - perror("Cannot send dump request"); - return -2; - } - } else { - if (rtnl_rtcache_request(&rth, do_ipv6) < 0) { - perror("Cannot send dump request"); - return -2; - } + if (rtnl_routedump_req(&rth, do_ipv6, iproute_dump_filter) < 0) { + perror("Cannot send dump request"); + return -2; } new_json_obj(json); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index b9c37fd3..56a1cade 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -266,11 
+266,13 @@ int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_routedump_req(struct rtnl_handle *rth, int family) +int rtnl_routedump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr nlh; struct rtmsg rtm; + char buf[128]; } req = { .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), .nlh.nlmsg_type = RTM_GETROUTE, @@ -279,6 +281,14 @@ int rtnl_routedump_req(struct rtnl_handle *rth, int family) .rtm.rtm_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } From c7e6371bc4afe6d42700d3174a6c56cba5833844 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 19 Oct 2018 15:41:39 -0700 Subject: [PATCH 35/50] ip route: Add protocol, table id and device to dump request Add protocol, table id and device to dump request if set in filter. If kernel side filtering is supported it is used to reduce the amount of data sent to userspace. Older kernels do not parse attributes on a route dump request, so these are silently ignored and ip will do the filtering in userspace. 
Signed-off-by: David Ahern --- ip/iproute.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ip/iproute.c b/ip/iproute.c index 3c0be0a9..5bffb9d8 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1669,10 +1669,24 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) { struct rtmsg *rtm = NLMSG_DATA(nlh); + int err; + rtm->rtm_protocol = filter.protocol; if (filter.cloned) rtm->rtm_flags |= RTM_F_CLONED; + if (filter.tb) { + err = addattr32(nlh, reqlen, RTA_TABLE, filter.tb); + if (err) + return err; + } + + if (filter.oif) { + err = addattr32(nlh, reqlen, RTA_OIF, filter.oif); + if (err) + return err; + } + return 0; } From 98ce99273f24f74c34d39e887b9f16b6e1a36ffc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 13:11:15 -0800 Subject: [PATCH 36/50] mroute: fix up family handling Only ipv4 and ipv6 have multicast routing. Set family accordingly and just return for other cases. Signed-off-by: David Ahern --- ip/ipmroute.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ip/ipmroute.c b/ip/ipmroute.c index de7a035f..b8f0bc49 100644 --- a/ip/ipmroute.c +++ b/ip/ipmroute.c @@ -223,18 +223,20 @@ void ipmroute_reset_filter(int ifindex) static int mroute_list(int argc, char **argv) { char *id = NULL; - int family; + int family = preferred_family; ipmroute_reset_filter(0); - if (preferred_family == AF_UNSPEC) - family = AF_INET; - else - family = AF_INET6; - if (family == AF_INET) { + if (family == AF_INET || family == AF_UNSPEC) { + family = RTNL_FAMILY_IPMR; filter.af = RTNL_FAMILY_IPMR; filter.tb = RT_TABLE_DEFAULT; /* for backward compatibility */ - } else + } else if (family == AF_INET6) { + family = RTNL_FAMILY_IP6MR; filter.af = RTNL_FAMILY_IP6MR; + } else { + /* family does not have multicast routing */ + return 0; + } filter.msrc.family = filter.mdst.family = family; From e41ede893960df15958fd146d857fa2cdc4df50e Mon Sep 17 
00:00:00 2001 From: David Ahern Date: Mon, 15 Oct 2018 10:30:34 -0700 Subject: [PATCH 37/50] mroute: Add table id attribute for kernel side filtering Similar to 'ip route' add the table id to the dump request for kernel side filtering if it is supported. Signed-off-by: David Ahern --- ip/ipmroute.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ip/ipmroute.c b/ip/ipmroute.c index b8f0bc49..b29c78e4 100644 --- a/ip/ipmroute.c +++ b/ip/ipmroute.c @@ -220,6 +220,19 @@ void ipmroute_reset_filter(int ifindex) filter.iif = ifindex; } +static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter.tb) { + err = addattr32(nlh, reqlen, RTA_TABLE, filter.tb); + if (err) + return err; + } + + return 0; +} + static int mroute_list(int argc, char **argv) { char *id = NULL; @@ -285,7 +298,7 @@ static int mroute_list(int argc, char **argv) filter.iif = idx; } - if (rtnl_routedump_req(&rth, filter.af, NULL) < 0) { + if (rtnl_routedump_req(&rth, filter.af, iproute_dump_filter) < 0) { perror("Cannot send dump request"); return 1; } From 7ca9cee8d88b99e7b4039346f9e829f1b20c718a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 13:28:47 -0800 Subject: [PATCH 38/50] ip address: Split ip_linkaddr_list into link and addr functions Split ip_linkaddr_list into one function that generates a list of devices and a second that generates the list of addresses. 
Signed-off-by: David Ahern --- ip/ip_common.h | 3 +-- ip/ipaddress.c | 45 +++++++++++++++++++++++---------------------- ip/ipvrf.c | 2 +- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/ip/ip_common.h b/ip/ip_common.h index 53668f59..d67575c6 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -84,8 +84,7 @@ int do_seg6(int argc, char **argv); int iplink_get(char *name, __u32 filt_mask); int iplink_ifla_xstats(int argc, char **argv); -int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, - struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo); +int ip_link_list(req_filter_fn_t filter_fn, struct nlmsg_chain *linfo); void free_nlmsg_chain(struct nlmsg_chain *info); static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 016662e9..746dbfc5 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1766,8 +1766,7 @@ static int iplink_filter_req(struct nlmsghdr *nlh, int reqlen) * caller can walk lists as desired and must call free_nlmsg_chain for * both when done */ -int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, - struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo) +int ip_link_list(req_filter_fn_t filter_fn, struct nlmsg_chain *linfo) { if (rtnl_linkdump_req_filter_fn(&rth, preferred_family, filter_fn) < 0) { @@ -1780,16 +1779,19 @@ int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, return 1; } - if (ainfo) { - if (rtnl_addrdump_req(&rth, family) < 0) { - perror("Cannot send dump request"); - return 1; - } + return 0; +} - if (rtnl_dump_filter(&rth, store_nlmsg, ainfo) < 0) { - fprintf(stderr, "Dump terminated\n"); - return 1; - } +static int ip_addr_list(struct nlmsg_chain *ainfo) +{ + if (rtnl_addrdump_req(&rth, filter.family) < 0) { + perror("Cannot send dump request"); + return 1; + } + + if (rtnl_dump_filter(&rth, store_nlmsg, ainfo) < 0) { + fprintf(stderr, "Dump terminated\n"); + return 1; } return 0; @@ -1798,7 +1800,7 @@ int ip_linkaddr_list(int family, 
req_filter_fn_t filter_fn, static int ipaddr_list_flush_or_save(int argc, char **argv, int action) { struct nlmsg_chain linfo = { NULL, NULL}; - struct nlmsg_chain _ainfo = { NULL, NULL}, *ainfo = NULL; + struct nlmsg_chain _ainfo = { NULL, NULL}, *ainfo = &_ainfo; struct nlmsg_list *l; char *filter_dev = NULL; int no_link = 0; @@ -1940,19 +1942,18 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) goto out; } - if (filter.family != AF_PACKET) { - ainfo = &_ainfo; - - if (filter.oneline) - no_link = 1; - } - - if (ip_linkaddr_list(filter.family, iplink_filter_req, - &linfo, ainfo) != 0) + if (ip_link_list(iplink_filter_req, &linfo) != 0) goto out; - if (filter.family != AF_PACKET) + if (filter.family != AF_PACKET) { + if (filter.oneline) + no_link = 1; + + if (ip_addr_list(ainfo) != 0) + goto out; + ipaddr_filter(&linfo, ainfo); + } for (l = linfo.head; l; l = l->next) { struct nlmsghdr *n = &l->h; diff --git a/ip/ipvrf.c b/ip/ipvrf.c index 8a6b7f97..08a0d45b 100644 --- a/ip/ipvrf.c +++ b/ip/ipvrf.c @@ -589,7 +589,7 @@ static int ipvrf_show(int argc, char **argv) return 0; } - if (ip_linkaddr_list(0, ipvrf_filter_req, &linfo, NULL) == 0) { + if (ip_link_list(ipvrf_filter_req, &linfo) == 0) { struct nlmsg_list *l; unsigned nvrf = 0; int n; From 8847097850571f389ecb396bfabc359582a49fee Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 13:30:44 -0800 Subject: [PATCH 39/50] ip address: Set device index in dump request Add a filter function to rtnl_addrdump_req to set device index in the address dump request if the user is filtering addresses by device. In addition, add a new ipaddr_link_get to do a single RTM_GETLINK request instead of a device dump yet still store the data in the linfo list. 
Signed-off-by: David Ahern --- include/libnetlink.h | 3 ++- ip/ipaddress.c | 59 ++++++++++++++++++++++++++++++++++++++------ lib/libnetlink.c | 12 ++++++++- 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index b0051f39..2621bc99 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -49,7 +49,8 @@ void rtnl_close(struct rtnl_handle *rth); typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); -int rtnl_addrdump_req(struct rtnl_handle *rth, int family) +int rtnl_addrdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 746dbfc5..2bc33f3a 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1679,6 +1679,15 @@ static void ipaddr_filter(struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo) } } +static int ipaddr_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct ifaddrmsg *ifa = NLMSG_DATA(nlh); + + ifa->ifa_index = filter.ifindex; + + return 0; +} + static int ipaddr_flush(void) { int round = 0; @@ -1689,7 +1698,8 @@ static int ipaddr_flush(void) filter.flushe = sizeof(flushb); while ((max_flush_loops == 0) || (round < max_flush_loops)) { - if (rtnl_addrdump_req(&rth, filter.family) < 0) { + if (rtnl_addrdump_req(&rth, filter.family, + ipaddr_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -1762,6 +1772,36 @@ static int iplink_filter_req(struct nlmsghdr *nlh, int reqlen) return 0; } +static int ipaddr_link_get(int index, struct nlmsg_chain *linfo) +{ + struct iplink_req req = { + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .n.nlmsg_flags = NLM_F_REQUEST, + .n.nlmsg_type = RTM_GETLINK, + .i.ifi_family = filter.family, + .i.ifi_index = index, + }; + __u32 filt_mask = RTEXT_FILTER_VF; + struct nlmsghdr *answer; + + if (!show_stats) + filt_mask |= 
RTEXT_FILTER_SKIP_STATS; + + addattr32(&req.n, sizeof(req), IFLA_EXT_MASK, filt_mask); + + if (rtnl_talk(&rth, &req.n, &answer) < 0) { + perror("Cannot send link request"); + return 1; + } + + if (store_nlmsg(answer, linfo) < 0) { + fprintf(stderr, "Failed to process link information\n"); + return 1; + } + + return 0; +} + /* fills in linfo with link data and optionally ainfo with address info * caller can walk lists as desired and must call free_nlmsg_chain for * both when done @@ -1784,7 +1824,7 @@ int ip_link_list(req_filter_fn_t filter_fn, struct nlmsg_chain *linfo) static int ip_addr_list(struct nlmsg_chain *ainfo) { - if (rtnl_addrdump_req(&rth, filter.family) < 0) { + if (rtnl_addrdump_req(&rth, filter.family, ipaddr_dump_filter) < 0) { perror("Cannot send dump request"); return 1; } @@ -1908,7 +1948,8 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) if (ipadd_save_prep()) exit(1); - if (rtnl_addrdump_req(&rth, preferred_family) < 0) { + if (rtnl_addrdump_req(&rth, preferred_family, + ipaddr_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -1942,8 +1983,13 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) goto out; } - if (ip_link_list(iplink_filter_req, &linfo) != 0) - goto out; + if (filter.ifindex) { + if (ipaddr_link_get(filter.ifindex, &linfo) != 0) + goto out; + } else { + if (ip_link_list(iplink_filter_req, &linfo) != 0) + goto out; + } if (filter.family != AF_PACKET) { if (filter.oneline) @@ -1972,8 +2018,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) fflush(stdout); out: - if (ainfo) - free_nlmsg_chain(ainfo); + free_nlmsg_chain(ainfo); free_nlmsg_chain(&linfo); delete_json_obj(); return 0; diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 56a1cade..0ddd646a 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -234,11 +234,13 @@ int rtnl_open(struct rtnl_handle *rth, unsigned int subscriptions) return rtnl_open_byproto(rth, subscriptions, 
NETLINK_ROUTE); } -int rtnl_addrdump_req(struct rtnl_handle *rth, int family) +int rtnl_addrdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr nlh; struct ifaddrmsg ifm; + char buf[128]; } req = { .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), .nlh.nlmsg_type = RTM_GETADDR, @@ -247,6 +249,14 @@ int rtnl_addrdump_req(struct rtnl_handle *rth, int family) .ifm.ifa_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } From aea41afcfd6d6547dd2e80ddde8df7e3b2800482 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Oct 2018 13:49:41 -0700 Subject: [PATCH 40/50] ip bridge: Set NETLINK_GET_STRICT_CHK on socket iproute2 has been updated for the new strict policy in the kernel. Add a helper to call setsockopt to enable the feature. Add a call to ip.c and bridge.c The setsockopt fails on older kernels and the error can be safely ignored - any new fields or attributes are ignored by the older kernel. 
Signed-off-by: David Ahern --- bridge/bridge.c | 4 ++++ include/libnetlink.h | 1 + ip/ip.c | 2 ++ lib/libnetlink.c | 9 +++++++++ 4 files changed, 16 insertions(+) diff --git a/bridge/bridge.c b/bridge/bridge.c index a3d8154b..a50d9d59 100644 --- a/bridge/bridge.c +++ b/bridge/bridge.c @@ -97,6 +97,8 @@ static int batch(const char *name) return EXIT_FAILURE; } + rtnl_set_strict_dump(&rth); + cmdlineno = 0; while (getcmdline(&line, &len, stdin) != -1) { char *largv[100]; @@ -205,6 +207,8 @@ main(int argc, char **argv) if (rtnl_open(&rth, 0) < 0) exit(1); + rtnl_set_strict_dump(&rth); + if (argc > 1) return do_cmd(argv[1], argc-1, argv+1); diff --git a/include/libnetlink.h b/include/libnetlink.h index 2621bc99..dc0c9c4e 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -46,6 +46,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions, __attribute__((warn_unused_result)); void rtnl_close(struct rtnl_handle *rth); +void rtnl_set_strict_dump(struct rtnl_handle *rth); typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); diff --git a/ip/ip.c b/ip/ip.c index a5bbacb4..e4131714 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -308,6 +308,8 @@ int main(int argc, char **argv) if (rtnl_open(&rth, 0) < 0) exit(1); + rtnl_set_strict_dump(&rth); + if (strlen(basename) > 2) return do_cmd(basename+2, argc, argv); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 0ddd646a..4d7d0810 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -161,6 +161,15 @@ static int nl_dump_ext_ack_done(const struct nlmsghdr *nlh, int error) } #endif +/* Older kernels may not support strict dump and filtering */ +void rtnl_set_strict_dump(struct rtnl_handle *rth) +{ + int one = 1; + + setsockopt(rth->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, + &one, sizeof(one)); +} + void rtnl_close(struct rtnl_handle *rth) { if (rth->fd >= 0) { From 8d4f35de1705b07fd686d93b0854a6b9052be3e6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 31 Oct 2018 12:29:45 -0700 
Subject: [PATCH 41/50] ip route: Rename do_ipv6 to dump_family do_ipv6 is really the preferred dump family. Rename it to make that apparent. Signed-off-by: David Ahern --- ip/iproute.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index 5bffb9d8..0440366e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -1604,7 +1604,7 @@ static int save_route_prep(void) return 0; } -static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) +static int iproute_flush(int family, rtnl_filter_t filter_fn) { time_t start = time(0); char flushb[4096-512]; @@ -1612,12 +1612,12 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) int ret; if (filter.cloned) { - if (do_ipv6 != AF_INET6) { + if (family != AF_INET6) { iproute_flush_cache(); if (show_stats) printf("*** IPv4 routing cache is flushed.\n"); } - if (do_ipv6 == AF_INET) + if (family == AF_INET) return 0; } @@ -1626,7 +1626,7 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) filter.flushe = sizeof(flushb); for (;;) { - if (rtnl_routedump_req(&rth, do_ipv6, NULL) < 0) { + if (rtnl_routedump_req(&rth, family, NULL) < 0) { perror("Cannot send dump request"); return -2; } @@ -1638,7 +1638,7 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) if (filter.flushed == 0) { if (show_stats) { if (round == 0 && - (!filter.cloned || do_ipv6 == AF_INET6)) + (!filter.cloned || family == AF_INET6)) printf("Nothing to flush.\n"); else printf("*** Flush is complete after %d round%s ***\n", @@ -1692,7 +1692,7 @@ static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) static int iproute_list_flush_or_save(int argc, char **argv, int action) { - int do_ipv6 = preferred_family; + int dump_family = preferred_family; char *id = NULL; char *od = NULL; unsigned int mark = 0; @@ -1811,13 +1811,13 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) NEXT_ARG(); family = read_family(*argv); if 
(family == AF_UNSPEC) - family = do_ipv6; + family = dump_family; else NEXT_ARG(); get_prefix(&filter.rvia, *argv, family); } else if (strcmp(*argv, "src") == 0) { NEXT_ARG(); - get_prefix(&filter.rprefsrc, *argv, do_ipv6); + get_prefix(&filter.rprefsrc, *argv, dump_family); } else if (matches(*argv, "realms") == 0) { __u32 realm; @@ -1837,15 +1837,15 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) NEXT_ARG(); if (matches(*argv, "root") == 0) { NEXT_ARG(); - get_prefix(&filter.rsrc, *argv, do_ipv6); + get_prefix(&filter.rsrc, *argv, dump_family); } else if (matches(*argv, "match") == 0) { NEXT_ARG(); - get_prefix(&filter.msrc, *argv, do_ipv6); + get_prefix(&filter.msrc, *argv, dump_family); } else { if (matches(*argv, "exact") == 0) { NEXT_ARG(); } - get_prefix(&filter.msrc, *argv, do_ipv6); + get_prefix(&filter.msrc, *argv, dump_family); filter.rsrc = filter.msrc; } } else { @@ -1854,23 +1854,23 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) } if (matches(*argv, "root") == 0) { NEXT_ARG(); - get_prefix(&filter.rdst, *argv, do_ipv6); + get_prefix(&filter.rdst, *argv, dump_family); } else if (matches(*argv, "match") == 0) { NEXT_ARG(); - get_prefix(&filter.mdst, *argv, do_ipv6); + get_prefix(&filter.mdst, *argv, dump_family); } else { if (matches(*argv, "exact") == 0) { NEXT_ARG(); } - get_prefix(&filter.mdst, *argv, do_ipv6); + get_prefix(&filter.mdst, *argv, dump_family); filter.rdst = filter.mdst; } } argc--; argv++; } - if (do_ipv6 == AF_UNSPEC && filter.tb) - do_ipv6 = AF_INET; + if (dump_family == AF_UNSPEC && filter.tb) + dump_family = AF_INET; if (id || od) { int idx; @@ -1893,9 +1893,9 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) filter.mark = mark; if (action == IPROUTE_FLUSH) - return iproute_flush(do_ipv6, filter_fn); + return iproute_flush(dump_family, filter_fn); - if (rtnl_routedump_req(&rth, do_ipv6, iproute_dump_filter) < 0) { + if (rtnl_routedump_req(&rth, 
dump_family, iproute_dump_filter) < 0) { perror("Cannot send dump request"); return -2; } From 6b83edc0610c40c353037ea80e8e630fe420dd00 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Dec 2018 17:07:19 -0800 Subject: [PATCH 42/50] neighbor: Add support for protocol attribute Add support to set protocol on neigh entries and to print the protocol on dumps. Signed-off-by: David Ahern --- ip/ipneigh.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 6041c467..26ac2d1b 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -40,6 +40,7 @@ static struct int flushp; int flushe; int master; + int protocol; } filter; static void usage(void) __attribute__((noreturn)); @@ -48,7 +49,7 @@ static void usage(void) { fprintf(stderr, "Usage: ip neigh { add | del | change | replace }\n" " { ADDR [ lladdr LLADDR ] [ nud STATE ] | proxy ADDR } [ dev DEV ]\n"); - fprintf(stderr, " [ router ] [ extern_learn ]\n\n"); + fprintf(stderr, " [ router ] [ extern_learn ] [ protocol PROTO ]\n\n"); fprintf(stderr, " ip neigh { show | flush } [ proxy ] [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n"); fprintf(stderr, " [ vrf NAME ]\n\n"); fprintf(stderr, "STATE := { permanent | noarp | stale | reachable | none |\n" @@ -148,6 +149,14 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) NEXT_ARG(); dev = *argv; dev_ok = 1; + } else if (matches(*argv, "protocol") == 0) { + __u32 proto; + + NEXT_ARG(); + if (rtnl_rtprot_a2n(&proto, *argv)) + invarg("\"protocol\" value is invalid\n", *argv); + if (addattr8(&req.n, sizeof(req), NDA_PROTOCOL, proto)) + return -1; } else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); @@ -244,6 +253,7 @@ int print_neigh(struct nlmsghdr *n, void *arg) int len = n->nlmsg_len; struct rtattr *tb[NDA_MAX+1]; static int logit = 1; + __u8 protocol = 0; if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH && n->nlmsg_type != RTM_GETNEIGH) { @@ -285,6 +295,12 @@ 
int print_neigh(struct nlmsghdr *n, void *arg) if (inet_addr_match_rta(&filter.pfx, tb[NDA_DST])) return 0; + if (tb[NDA_PROTOCOL]) + protocol = rta_getattr_u8(tb[NDA_PROTOCOL]); + + if (filter.protocol && filter.protocol != protocol) + return 0; + if (filter.unused_only && tb[NDA_CACHEINFO]) { struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]); @@ -371,6 +387,13 @@ int print_neigh(struct nlmsghdr *n, void *arg) if (r->ndm_state) print_neigh_state(r->ndm_state); + if (protocol) { + SPRINT_BUF(b1); + + print_string(PRINT_ANY, "protocol", " proto %s ", + rtnl_rtprot_n2a(protocol, b1, sizeof(b1))); + } + print_string(PRINT_FP, NULL, "\n", ""); close_json_object(); fflush(stdout); @@ -458,9 +481,19 @@ static int do_show_or_flush(int argc, char **argv, int flush) if (state == 0) state = 0x100; filter.state |= state; - } else if (strcmp(*argv, "proxy") == 0) + } else if (strcmp(*argv, "proxy") == 0) { req.ndm.ndm_flags = NTF_PROXY; - else { + } else if (matches(*argv, "protocol") == 0) { + __u32 prot; + + NEXT_ARG(); + if (rtnl_rtprot_a2n(&prot, *argv)) { + if (strcmp(*argv, "all")) + invarg("invalid \"protocol\"\n", *argv); + prot = 0; + } + filter.protocol = prot; + } else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); } From 40fc8c2cecdda1f64ad98314fa379adbb6cee458 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Sun, 23 Dec 2018 13:24:19 +0200 Subject: [PATCH 43/50] rdma: Add print of link CapabilityMask2 flags CapabilityMask2 is defined in IBTA spec as a member of PortInfo. Add translation to string of new CapabilityMask2 expansion of link caps. 
The flags are concatenated to current caps print as seen in this example printing EXT_INFO flag: root@server-22 $ rdma -d link 1/1: mlx5_0/1: subnet_prefix fe80:0000:0000:0000 lid 2 sm_lid 2 lmc 0 state ACTIVE physical_state LINK_UP caps: Signed-off-by: Michael Guralnik Signed-off-by: Leon Romanovsky Signed-off-by: David Ahern --- rdma/link.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/rdma/link.c b/rdma/link.c index 7a6d4b7e..c064be62 100644 --- a/rdma/link.c +++ b/rdma/link.c @@ -19,7 +19,7 @@ static int link_help(struct rd *rd) static const char *caps_to_str(uint32_t idx) { -#define RDMA_PORT_FLAGS(x) \ +#define RDMA_PORT_FLAGS_LOW(x) \ x(RESERVED, 0) \ x(SM, 1) \ x(NOTICE, 2) \ @@ -53,13 +53,39 @@ static const char *caps_to_str(uint32_t idx) x(MULT_FDB, 30) \ x(HIERARCHY_INFO, 31) - enum { RDMA_PORT_FLAGS(RDMA_BITMAP_ENUM) }; +#define RDMA_PORT_FLAGS_HIGH(x) \ + x(SET_NODE_DESC, 0) \ + x(EXT_INFO, 1) \ + x(VIRT, 2) \ + x(SWITCH_POR_STATE_TABLE, 3) \ + x(LINK_WIDTH_2X, 4) \ + x(LINK_SPEED_HDR, 5) + + /* + * Separation below is needed to allow compilation of rdmatool + * on 32bits systems. On such systems, C-enum is limited to be + * int and can't hold more than 32 bits. 
+ */ + enum { RDMA_PORT_FLAGS_LOW(RDMA_BITMAP_ENUM) }; + enum { RDMA_PORT_FLAGS_HIGH(RDMA_BITMAP_ENUM) }; static const char * const - rdma_port_names[] = { RDMA_PORT_FLAGS(RDMA_BITMAP_NAMES) }; - #undef RDMA_PORT_FLAGS + rdma_port_names_low[] = { RDMA_PORT_FLAGS_LOW(RDMA_BITMAP_NAMES) }; + static const char * const + rdma_port_names_high[] = { RDMA_PORT_FLAGS_HIGH(RDMA_BITMAP_NAMES) }; + uint32_t high_idx; + #undef RDMA_PORT_FLAGS_LOW + #undef RDMA_PORT_FLAGS_HIGH - return rdma_port_names[idx]; + if (idx < ARRAY_SIZE(rdma_port_names_low) && rdma_port_names_low[idx]) + return rdma_port_names_low[idx]; + + high_idx = idx - ARRAY_SIZE(rdma_port_names_low); + if (high_idx < ARRAY_SIZE(rdma_port_names_high) && + rdma_port_names_high[high_idx]) + return rdma_port_names_high[high_idx]; + + return "UNKNOWN"; } static void link_print_caps(struct rd *rd, struct nlattr **tb) From 66e8e73edc659cf09653487db2cb7eb4be25b575 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 30 Dec 2018 17:14:54 +0000 Subject: [PATCH 44/50] bridge: fdb: Use 'struct ndmsg' for FDB dumping Since commit aea41afcfd6d ("ip bridge: Set NETLINK_GET_STRICT_CHK on socket") iproute2 uses strict checking on kernels that support it. This causes FDB dumping to fail [1], as iproute2 uses 'struct ifinfomsg' whereas the kernel expects 'struct ndmsg'. Note that with this change iproute2 continues to work on old kernels that do not support strict checking, but contain the fix introduced in kernel commit bd961c9bc664 ("rtnetlink: fix rtnl_fdb_dump() for ndmsg header"). [1] # bridge fdb show [ 5365.137224] netlink: 4 bytes leftover after parsing attributes in process `bridge'. Error: bytes leftover after parsing attributes. 
Dump terminated Fixes: aea41afcfd6d ("ip bridge: Set NETLINK_GET_STRICT_CHK on socket") Signed-off-by: Ido Schimmel Signed-off-by: David Ahern --- bridge/fdb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index a5abc1b6..a7a0d805 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -260,16 +260,16 @@ static int fdb_show(int argc, char **argv) { struct { struct nlmsghdr n; - struct ifinfomsg ifm; + struct ndmsg ndm; char buf[256]; } req = { - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - .ifm.ifi_family = PF_BRIDGE, + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), + .ndm.ndm_family = PF_BRIDGE, }; char *filter_dev = NULL; char *br = NULL; - int msg_size = sizeof(struct ifinfomsg); + int msg_size = sizeof(struct ndmsg); while (argc > 0) { if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) { @@ -313,10 +313,10 @@ static int fdb_show(int argc, char **argv) filter_index = ll_name_to_index(filter_dev); if (!filter_index) return nodev(filter_dev); - req.ifm.ifi_index = filter_index; + req.ndm.ndm_ifindex = filter_index; } - if (rtnl_dump_request(&rth, RTM_GETNEIGH, &req.ifm, msg_size) < 0) { + if (rtnl_dump_request(&rth, RTM_GETNEIGH, &req.ndm, msg_size) < 0) { perror("Cannot send dump request"); exit(1); } From f255ab122537992e522abd29333084ca434b0a61 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 31 Dec 2018 09:54:47 -0800 Subject: [PATCH 45/50] libnetlink: Add filter function to rtnl_neighdump_req Add filter function to rtnl_neighdump_req and a buffer to the request for the filter functions to append attributes. 
Signed-off-by: David Ahern --- include/libnetlink.h | 3 ++- lib/libnetlink.c | 12 +++++++++++- misc/arpd.c | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index dc0c9c4e..14895151 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -60,7 +60,8 @@ int rtnl_routedump_req(struct rtnl_handle *rth, int family, __attribute__((warn_unused_result)); int rtnl_ruledump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); -int rtnl_neighdump_req(struct rtnl_handle *rth, int family) +int rtnl_neighdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_neightbldump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 4d7d0810..19318b44 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -327,11 +327,13 @@ int rtnl_ruledump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_neighdump_req(struct rtnl_handle *rth, int family) +int rtnl_neighdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr nlh; struct ndmsg ndm; + char buf[256]; } req = { .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), .nlh.nlmsg_type = RTM_GETNEIGH, @@ -340,6 +342,14 @@ int rtnl_neighdump_req(struct rtnl_handle *rth, int family) .ndm.ndm_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } diff --git a/misc/arpd.c b/misc/arpd.c index ce7c0997..504961cb 100644 --- a/misc/arpd.c +++ b/misc/arpd.c @@ -424,7 +424,7 @@ static int do_one_request(struct nlmsghdr *n) static void load_initial_table(void) { - if (rtnl_neighdump_req(&rth, AF_INET) < 0) { + if (rtnl_neighdump_req(&rth, AF_INET, NULL) < 0) { perror("dump request failed"); exit(1); } From 
101ec10a768a55d8bef9f290b65788b6f16a09eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 31 Dec 2018 09:55:45 -0800 Subject: [PATCH 46/50] ip neigh: Convert do_show_or_flush to use rtnl_neighdump_req Add ipneigh_dump_filter to add filter attributes to the neighbor dump request and update do_show_or_flush to use rtnl_neighdump_req. Signed-off-by: David Ahern --- ip/ipneigh.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 26ac2d1b..2d717d2d 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -41,6 +41,7 @@ static struct int flushe; int master; int protocol; + __u8 ndm_flags; } filter; static void usage(void) __attribute__((noreturn)); @@ -408,16 +409,29 @@ void ipneigh_reset_filter(int ifindex) filter.index = ifindex; } +static int ipneigh_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + int err; + + ndm->ndm_flags = filter.ndm_flags; + + if (filter.index) { + err = addattr32(nlh, reqlen, NDA_IFINDEX, filter.index); + if (err) + return err; + } + if (filter.master) { + err = addattr32(nlh, reqlen, NDA_MASTER, filter.master); + if (err) + return err; + } + + return 0; +} + static int do_show_or_flush(int argc, char **argv, int flush) { - struct { - struct nlmsghdr n; - struct ndmsg ndm; - char buf[256]; - } req = { - .n.nlmsg_type = RTM_GETNEIGH, - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), - }; char *filter_dev = NULL; int state_given = 0; @@ -448,7 +462,6 @@ static int do_show_or_flush(int argc, char **argv, int flush) ifindex = ll_name_to_index(*argv); if (!ifindex) invarg("Device does not exist\n", *argv); - addattr32(&req.n, sizeof(req), NDA_MASTER, ifindex); filter.master = ifindex; } else if (strcmp(*argv, "vrf") == 0) { int ifindex; @@ -459,7 +472,6 @@ static int do_show_or_flush(int argc, char **argv, int flush) invarg("Not a valid VRF name\n", *argv); if (!name_is_vrf(*argv)) invarg("Not a valid VRF name\n", 
*argv); - addattr32(&req.n, sizeof(req), NDA_MASTER, ifindex); filter.master = ifindex; } else if (strcmp(*argv, "unused") == 0) { filter.unused_only = 1; @@ -482,7 +494,7 @@ static int do_show_or_flush(int argc, char **argv, int flush) state = 0x100; filter.state |= state; } else if (strcmp(*argv, "proxy") == 0) { - req.ndm.ndm_flags = NTF_PROXY; + filter.ndm_flags = NTF_PROXY; } else if (matches(*argv, "protocol") == 0) { __u32 prot; @@ -513,11 +525,8 @@ static int do_show_or_flush(int argc, char **argv, int flush) filter.index = ll_name_to_index(filter_dev); if (!filter.index) return nodev(filter_dev); - addattr32(&req.n, sizeof(req), NDA_IFINDEX, filter.index); } - req.ndm.ndm_family = filter.family; - if (flush) { int round = 0; char flushb[4096-512]; @@ -527,7 +536,8 @@ static int do_show_or_flush(int argc, char **argv, int flush) filter.flushe = sizeof(flushb); while (round < MAX_ROUNDS) { - if (rtnl_dump_request_n(&rth, &req.n) < 0) { + if (rtnl_neighdump_req(&rth, filter.family, + ipneigh_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -560,7 +570,7 @@ static int do_show_or_flush(int argc, char **argv, int flush) return 1; } - if (rtnl_dump_request_n(&rth, &req.n) < 0) { + if (rtnl_neighdump_req(&rth, filter.family, ipneigh_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } From 66b4199f22d41ee3c128e7795a046d72e74433c6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 31 Dec 2018 10:00:24 -0800 Subject: [PATCH 47/50] bridge: Update fdb show to use rtnl_neighdump_req Add fdb_dump_filter to set filter attributes in dump request and convert fdb_show to use rtnl_neighdump_req. 
Signed-off-by: David Ahern --- bridge/fdb.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index a7a0d805..9b98fdf8 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -30,7 +30,7 @@ #include "rt_names.h" #include "utils.h" -static unsigned int filter_index, filter_vlan, filter_state; +static unsigned int filter_index, filter_vlan, filter_state, filter_master; static void usage(void) { @@ -256,20 +256,29 @@ int print_fdb(struct nlmsghdr *n, void *arg) return 0; } +static int fdb_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter_index) { + struct ndmsg *ndm = NLMSG_DATA(nlh); + + ndm->ndm_ifindex = filter_index; + } + + if (filter_master) { + err = addattr32(nlh, reqlen, NDA_MASTER, filter_master); + if (err) + return err; + } + + return 0; +} + static int fdb_show(int argc, char **argv) { - struct { - struct nlmsghdr n; - struct ndmsg ndm; - char buf[256]; - } req = { - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), - .ndm.ndm_family = PF_BRIDGE, - }; - char *filter_dev = NULL; char *br = NULL; - int msg_size = sizeof(struct ndmsg); while (argc > 0) { if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) { @@ -304,8 +313,7 @@ static int fdb_show(int argc, char **argv) fprintf(stderr, "Cannot find bridge device \"%s\"\n", br); return -1; } - addattr32(&req.n, sizeof(req), IFLA_MASTER, br_ifindex); - msg_size += RTA_LENGTH(4); + filter_master = br_ifindex; } /*we'll keep around filter_dev for older kernels */ @@ -313,10 +321,9 @@ static int fdb_show(int argc, char **argv) filter_index = ll_name_to_index(filter_dev); if (!filter_index) return nodev(filter_dev); - req.ndm.ndm_ifindex = filter_index; } - if (rtnl_dump_request(&rth, RTM_GETNEIGH, &req.ndm, msg_size) < 0) { + if (rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } From 285033bfebbd287e41176611ac7c95c4c3d84eff Mon Sep 17 00:00:00 
2001 From: David Ahern Date: Wed, 2 Jan 2019 16:31:38 -0800 Subject: [PATCH 48/50] libnetlink: Add RTNL_HANDLE_F_STRICT_CHK flag Add RTNL_HANDLE_F_STRICT_CHK flag and set it in rth flags to let commands know if the kernel supports strict checking. Extracted from patch from Ido to fix filtering with strict checking enabled. Cc: Ido Schimmel Signed-off-by: David Ahern --- include/libnetlink.h | 1 + lib/libnetlink.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/libnetlink.h b/include/libnetlink.h index 14895151..0854d6ad 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -23,6 +23,7 @@ struct rtnl_handle { FILE *dump_fp; #define RTNL_HANDLE_F_LISTEN_ALL_NSID 0x01 #define RTNL_HANDLE_F_SUPPRESS_NLERR 0x02 +#define RTNL_HANDLE_F_STRICT_CHK 0x04 int flags; }; diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 19318b44..98cb9d94 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -166,8 +166,11 @@ void rtnl_set_strict_dump(struct rtnl_handle *rth) { int one = 1; - setsockopt(rth->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, - &one, sizeof(one)); + if (setsockopt(rth->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, + &one, sizeof(one)) < 0) + return; + + rth->flags |= RTNL_HANDLE_F_STRICT_CHK; } void rtnl_close(struct rtnl_handle *rth) From 05880354c2cf37579fd4cd2f1d95a6b848f5dacc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Jan 2019 16:33:42 -0800 Subject: [PATCH 49/50] bridge: fdb: Fix filtering with strict checking disabled Older kernels expect an ifinfomsg struct as the ancillary header, and after kernel commit bd961c9bc664 ("rtnetlink: fix rtnl_fdb_dump() for ndmsg header") can handle either ifinfomsg or ndmsg. Strict data checking only allows ndmsg. Use the new RTNL_HANDLE_F_STRICT_CHK flag to know which header to send.
Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel --- bridge/fdb.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index 9b98fdf8..f75e953a 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -256,6 +256,25 @@ int print_fdb(struct nlmsghdr *n, void *arg) return 0; } +static int fdb_linkdump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter_index) { + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + + ifm->ifi_index = filter_index; + } + + if (filter_master) { + err = addattr32(nlh, reqlen, IFLA_MASTER, filter_master); + if (err) + return err; + } + + return 0; +} + static int fdb_dump_filter(struct nlmsghdr *nlh, int reqlen) { int err; @@ -279,6 +298,7 @@ static int fdb_show(int argc, char **argv) { char *filter_dev = NULL; char *br = NULL; + int rc; while (argc > 0) { if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) { @@ -323,7 +343,12 @@ static int fdb_show(int argc, char **argv) return nodev(filter_dev); } - if (rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter) < 0) { + if (rth.flags & RTNL_HANDLE_F_STRICT_CHK) + rc = rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter); + else + rc = rtnl_linkdump_req_filter_fn(&rth, PF_BRIDGE, + fdb_linkdump_filter); + if (rc < 0) { perror("Cannot send dump request"); exit(1); } From 97b44d571df3cff9b828fab5811f1ce0b8504fde Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 6 Jan 2019 16:17:13 -0800 Subject: [PATCH 50/50] libnetlink: linkdump_req is done for AF_BRIDGE as well The bridge command 'vlan show' calls rtnl_linkdump_req_filter for family AF_BRIDGE. Update rtnl_linkdump_req_filter to send the filter for that family as well. 
Fixes: d97b16b2c906 ("libnetlink: linkdump_req: Only AF_UNSPEC family expects an ext_filter_mask") Reported-by: Ido Schimmel Signed-off-by: David Ahern Tested-by: Ido Schimmel --- lib/libnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 98cb9d94..110f47bc 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -449,7 +449,7 @@ int rtnl_linkdump_req(struct rtnl_handle *rth, int family) int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, __u32 filt_mask) { - if (family == AF_UNSPEC) { + if (family == AF_UNSPEC || family == AF_BRIDGE) { struct { struct nlmsghdr nlh; struct ifinfomsg ifm;