diff --git a/Makefile b/Makefile index 20c760e2..a513cf38 100644 --- a/Makefile +++ b/Makefile @@ -40,12 +40,6 @@ DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \ -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \ -DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" -#options for decnet -ADDLIB+=dnet_ntop.o dnet_pton.o - -#options for ipx -ADDLIB+=ipx_ntop.o ipx_pton.o - #options for mpls ADDLIB+=mpls_ntop.o mpls_pton.o diff --git a/README.decnet b/README.decnet deleted file mode 100644 index 4300f906..00000000 --- a/README.decnet +++ /dev/null @@ -1,33 +0,0 @@ - -Here are a few quick points about DECnet support... - - o iproute2 is the tool of choice for configuring the DECnet support for - Linux. For many features, it is the only tool which can be used to - configure them. - - o No name resolution is available as yet, all addresses must be - entered numerically. - - o Remember to set the hardware address of the interface using: - - ip link set ethX address xx:xx:xx:xx:xx:xx - (where xx:xx:xx:xx:xx:xx is the MAC address for your DECnet node - address) - - if your Ethernet card won't listen to more than one unicast - mac address at once. If the Linux DECnet stack doesn't talk to - any other DECnet nodes, then check this with tcpdump and if its - a problem, change the mac address (but do this _before_ starting - any other network protocol on the interface) - - o Whilst you can use ip addr add to add more than one DECnet address to an - interface, don't expect addresses which are not the same as the - kernels node address to work properly with 2.4 kernels. This should - be fine with 2.6 kernels as the routing code has been extensively - modified and improved. - - o The DECnet support is currently self contained. It does not depend on - the libdnet library. - -Steve Whitehouse - diff --git a/README.iproute2+tc b/README.iproute2+tc index 5979098e..e7bb48ce 100644 --- a/README.iproute2+tc +++ b/README.iproute2+tc @@ -42,7 +42,7 @@ in rsvp/cbqinit.eth1. 
Terminology and advices about setting CBQ parameters may be found in Sally Floyd -papers. +papers. Pairs X:Y are class handles, X:0 are qdisc handles. diff --git a/README.lnstat b/README.lnstat index 057925f6..eab4088f 100644 --- a/README.lnstat +++ b/README.lnstat @@ -7,9 +7,9 @@ This tool is a generalized and more feature-complete replacement for the old In addition to routing cache statistics, it supports any kind of statistics the linux kernel exports via a file in /proc/net/stat. In a stock 2.6.9 -kernel, this is - per-protocol neighbour cache statistics - (ipv4, ipv6, atm, decnet) +kernel, this is + per-protocol neighbour cache statistics + (ipv4, ipv6, atm) routing cache statistics (ipv4) connection tracking statistics @@ -29,7 +29,7 @@ In order to get a list of supported statistics files, you can run lnstat -d It will display something like - + /proc/net/stat/arp_cache: 1: entries 2: allocs @@ -52,19 +52,19 @@ arp_cach|rt_cache|arp_cach| You can specify the interval (e.g. 10 seconds) by: - + lnstat -i 10 You can specify to only use one particular statistics file: lnstat -f ip_conntrack -You can specify individual field widths +You can specify individual field widths lnstat -k arp_cache:entries,rt_cache:entries -w 20,8 You can specify not to print a header at all - + lnstat -s 0 You can specify to print a header only at start of the program @@ -76,6 +76,5 @@ You can specify to print a header at start and every 20 lines: lnstat -s 20 You can specify the number of samples you want to take (e.g. 
5): - - lnstat -c 5 + lnstat -c 5 diff --git a/bash-completion/tc b/bash-completion/tc index 29bca5d9..007e1c2e 100644 --- a/bash-completion/tc +++ b/bash-completion/tc @@ -302,7 +302,7 @@ _tc_qdisc_options() ;; gred) _tc_once_attr 'setup vqs default grio vq prio limit min max avpkt \ - burst probability bandwidth' + burst probability bandwidth ecn harddrop' return 0 ;; hhf) diff --git a/bridge/bridge.c b/bridge/bridge.c index a3d8154b..a50d9d59 100644 --- a/bridge/bridge.c +++ b/bridge/bridge.c @@ -97,6 +97,8 @@ static int batch(const char *name) return EXIT_FAILURE; } + rtnl_set_strict_dump(&rth); + cmdlineno = 0; while (getcmdline(&line, &len, stdin) != -1) { char *largv[100]; @@ -205,6 +207,8 @@ main(int argc, char **argv) if (rtnl_open(&rth, 0) < 0) exit(1); + rtnl_set_strict_dump(&rth); + if (argc > 1) return do_cmd(argv[1], argc-1, argv+1); diff --git a/bridge/fdb.c b/bridge/fdb.c index a5abc1b6..f75e953a 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -30,7 +30,7 @@ #include "rt_names.h" #include "utils.h" -static unsigned int filter_index, filter_vlan, filter_state; +static unsigned int filter_index, filter_vlan, filter_state, filter_master; static void usage(void) { @@ -256,20 +256,49 @@ int print_fdb(struct nlmsghdr *n, void *arg) return 0; } +static int fdb_linkdump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter_index) { + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + + ifm->ifi_index = filter_index; + } + + if (filter_master) { + err = addattr32(nlh, reqlen, IFLA_MASTER, filter_master); + if (err) + return err; + } + + return 0; +} + +static int fdb_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter_index) { + struct ndmsg *ndm = NLMSG_DATA(nlh); + + ndm->ndm_ifindex = filter_index; + } + + if (filter_master) { + err = addattr32(nlh, reqlen, NDA_MASTER, filter_master); + if (err) + return err; + } + + return 0; +} + static int fdb_show(int argc, char **argv) { - struct { - struct nlmsghdr n; - struct 
ifinfomsg ifm; - char buf[256]; - } req = { - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), - .ifm.ifi_family = PF_BRIDGE, - }; - char *filter_dev = NULL; char *br = NULL; - int msg_size = sizeof(struct ifinfomsg); + int rc; while (argc > 0) { if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) { @@ -304,8 +333,7 @@ static int fdb_show(int argc, char **argv) fprintf(stderr, "Cannot find bridge device \"%s\"\n", br); return -1; } - addattr32(&req.n, sizeof(req), IFLA_MASTER, br_ifindex); - msg_size += RTA_LENGTH(4); + filter_master = br_ifindex; } /*we'll keep around filter_dev for older kernels */ @@ -313,10 +341,14 @@ static int fdb_show(int argc, char **argv) filter_index = ll_name_to_index(filter_dev); if (!filter_index) return nodev(filter_dev); - req.ifm.ifi_index = filter_index; } - if (rtnl_dump_request(&rth, RTM_GETNEIGH, &req.ifm, msg_size) < 0) { + if (rth.flags & RTNL_HANDLE_F_STRICT_CHK) + rc = rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter); + else + rc = rtnl_linkdump_req_filter_fn(&rth, PF_BRIDGE, + fdb_linkdump_filter); + if (rc < 0) { perror("Cannot send dump request"); exit(1); } diff --git a/devlink/devlink.c b/devlink/devlink.c index 8bb254ea..3651e90c 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1920,10 +1920,80 @@ static int cmd_dev_eswitch(struct dl *dl) return -ENOENT; } -static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl) +struct param_val_conv { + const char *name; + const char *vstr; + uint32_t vuint; +}; + +static bool param_val_conv_exists(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if (!strcmp(param_val_conv[i].name, name)) + return true; + + return false; +} + +static int +param_val_conv_uint_get(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name, const char *vstr, + uint32_t *vuint) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if 
(!strcmp(param_val_conv[i].name, name) && + !strcmp(param_val_conv[i].vstr, vstr)) { + *vuint = param_val_conv[i].vuint; + return 0; + } + + return -ENOENT; +} + +static int +param_val_conv_str_get(const struct param_val_conv *param_val_conv, + uint32_t len, const char *name, uint32_t vuint, + const char **vstr) +{ + uint32_t i; + + for (i = 0; i < len; i++) + if (!strcmp(param_val_conv[i].name, name) && + param_val_conv[i].vuint == vuint) { + *vstr = param_val_conv[i].vstr; + return 0; + } + + return -ENOENT; +} + +static const struct param_val_conv param_val_conv[] = { + { + .name = "fw_load_policy", + .vstr = "driver", + .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, + }, + { + .name = "fw_load_policy", + .vstr = "flash", + .vuint = DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, + }, +}; + +#define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv) + +static void pr_out_param_value(struct dl *dl, const char *nla_name, + int nla_type, struct nlattr *nl) { struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {}; struct nlattr *val_attr; + const char *vstr; + bool conv_exists; int err; err = mnl_attr_parse_nested(nl, attr_cb, nla_value); @@ -1939,15 +2009,51 @@ static void pr_out_param_value(struct dl *dl, int nla_type, struct nlattr *nl) param_cmode_name(mnl_attr_get_u8(nla_value[DEVLINK_ATTR_PARAM_VALUE_CMODE]))); val_attr = nla_value[DEVLINK_ATTR_PARAM_VALUE_DATA]; + conv_exists = param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN, + nla_name); + switch (nla_type) { case MNL_TYPE_U8: - pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u8(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u8(val_attr)); + } break; case MNL_TYPE_U16: - pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + 
PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u16(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u16(val_attr)); + } break; case MNL_TYPE_U32: - pr_out_uint(dl, "value", mnl_attr_get_u32(val_attr)); + if (conv_exists) { + err = param_val_conv_str_get(param_val_conv, + PARAM_VAL_CONV_LEN, + nla_name, + mnl_attr_get_u32(val_attr), + &vstr); + if (err) + return; + pr_out_str(dl, "value", vstr); + } else { + pr_out_uint(dl, "value", mnl_attr_get_u32(val_attr)); + } break; case MNL_TYPE_STRING: pr_out_str(dl, "value", mnl_attr_get_str(val_attr)); @@ -1962,6 +2068,7 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) { struct nlattr *nla_param[DEVLINK_ATTR_MAX + 1] = {}; struct nlattr *param_value_attr; + const char *nla_name; int nla_type; int err; @@ -1980,8 +2087,8 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) nla_type = mnl_attr_get_u8(nla_param[DEVLINK_ATTR_PARAM_TYPE]); - pr_out_str(dl, "name", - mnl_attr_get_str(nla_param[DEVLINK_ATTR_PARAM_NAME])); + nla_name = mnl_attr_get_str(nla_param[DEVLINK_ATTR_PARAM_NAME]); + pr_out_str(dl, "name", nla_name); if (!nla_param[DEVLINK_ATTR_PARAM_GENERIC]) pr_out_str(dl, "type", "driver-specific"); @@ -1992,7 +2099,7 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) mnl_attr_for_each_nested(param_value_attr, nla_param[DEVLINK_ATTR_PARAM_VALUES_LIST]) { pr_out_entry_start(dl); - pr_out_param_value(dl, nla_type, param_value_attr); + pr_out_param_value(dl, nla_name, nla_type, param_value_attr); pr_out_entry_end(dl); } pr_out_array_end(dl); @@ -2097,6 +2204,7 @@ static int cmd_dev_param_set(struct dl *dl) { struct param_ctx ctx = {}; struct nlmsghdr *nlh; + bool conv_exists; uint32_t val_u32; uint16_t val_u16; uint8_t val_u8; @@ -2124,10 +2232,22 @@ static int cmd_dev_param_set(struct dl *dl) NLM_F_REQUEST | NLM_F_ACK); dl_opts_put(nlh, dl); + conv_exists = 
param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN, + dl->opts.param_name); + mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_TYPE, ctx.nla_type); switch (ctx.nla_type) { case MNL_TYPE_U8: - err = strtouint8_t(dl->opts.param_value, &val_u8); + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u8 = val_u32; + } else { + err = strtouint8_t(dl->opts.param_value, &val_u8); + } if (err) goto err_param_value_parse; if (val_u8 == ctx.value.vu8) @@ -2135,7 +2255,16 @@ static int cmd_dev_param_set(struct dl *dl) mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u8); break; case MNL_TYPE_U16: - err = strtouint16_t(dl->opts.param_value, &val_u16); + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u16 = val_u32; + } else { + err = strtouint16_t(dl->opts.param_value, &val_u16); + } if (err) goto err_param_value_parse; if (val_u16 == ctx.value.vu16) @@ -2143,7 +2272,14 @@ static int cmd_dev_param_set(struct dl *dl) mnl_attr_put_u16(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u16); break; case MNL_TYPE_U32: - err = strtouint32_t(dl->opts.param_value, &val_u32); + if (conv_exists) + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + else + err = strtouint32_t(dl->opts.param_value, &val_u32); if (err) goto err_param_value_parse; if (val_u32 == ctx.value.vu32) diff --git a/doc/actions/actions-general b/doc/actions/actions-general index 08cc785c..407a514c 100644 --- a/doc/actions/actions-general +++ b/doc/actions/actions-general @@ -6,8 +6,8 @@ What is it? ----------- An extension to the filtering/classification architecture of Linux Traffic -Control. -Up to 2.6.8 the only action that could be "attached" to a filter was policing. +Control. 
+Up to 2.6.8 the only action that could be "attached" to a filter was policing. i.e you could say something like: ----- @@ -17,7 +17,7 @@ tc filter add dev lo parent ffff: protocol ip prio 10 u32 match ip src \ which implies "if a packet is seen on the ingress of the lo device with a source IP address of 127.0.0.1/32 we give it a classification id of 1:1 and -we execute a policing action which rate limits its bandwidth utilization +we execute a policing action which rate limits its bandwidth utilization to 1.5Mbps". The new extensions allow for more than just policing actions to be added. @@ -29,9 +29,9 @@ syntax which will work fine. Of course to get the required effect you need both newer tc and kernel. If you are reading this you have the right tc ;-> -A side effect is that we can now get stateless firewalling to work with tc. +A side effect is that we can now get stateless firewalling to work with tc. Essentially this is now an alternative to iptables. -I won't go into details of my dislike for iptables at times, but +I won't go into details of my dislike for iptables at times, but scalability is one of the main issues; however, if you need stateful classification - use netfilter (for now). @@ -61,7 +61,7 @@ tc filter add dev lo parent 1:0 protocol ip prio 10 u32 \ match ip src 127.0.0.1/32 flowid 1:1 \ action police mtu 4000 rate 1500kbit burst 90k -" generic Actions" (gact) at the moment are: +" generic Actions" (gact) at the moment are: { drop, pass, reclassify, continue} (If you have others, no listed here give me a reason and we will add them) +drop says to drop the packet @@ -93,43 +93,43 @@ decimal 12, then use flowid 1:c. 3) A feature i call pipe The motivation is derived from Unix pipe mechanism but applied to packets. -Essentially take a matching packet and pass it through +Essentially take a matching packet and pass it through action1 | action2 | action3 etc. 
You could do something similar to this with the tc policer and the "continue" -operator but this rather restricts it to just the policer and requires -multiple rules (and lookups, hence quiet inefficient); +operator but this rather restricts it to just the policer and requires +multiple rules (and lookups, hence quiet inefficient); -as an example -- and please note that this is just an example _not_ The +as an example -- and please note that this is just an example _not_ The Word Youve Been Waiting For (yes i have had problems giving examples which ended becoming dogma in documents and people modifying them a little -to look clever); +to look clever); -i selected the metering rates to be small so that i can show better how +i selected the metering rates to be small so that i can show better how things work. - -The script below does the following: -- an incoming packet from 10.0.0.21 is first given a firewall mark of 1. -- It is then metered to make sure it does not exceed its allocated rate of +The script below does the following: +- an incoming packet from 10.0.0.21 is first given a firewall mark of 1. + +- It is then metered to make sure it does not exceed its allocated rate of 1Kbps. If it doesn't exceed rate, this is where we terminate action execution. -- If it does exceed its rate, its "color" changes to a mark of 2 and it is +- If it does exceed its rate, its "color" changes to a mark of 2 and it is then passed through a second meter. 
--The second meter is shared across all flows on that device [i am surpised -that this seems to be not a well know feature of the policer; Bert was telling +-The second meter is shared across all flows on that device [i am surpised +that this seems to be not a well know feature of the policer; Bert was telling me that someone was writing a qdisc just to do sharing across multiple devices; it must be the summer heat again; weve had someone doing that every year around -summer -- the key to sharing is to use a operator "index" in your policer -rules (example "index 20"). All your rules have to use the same index to +summer -- the key to sharing is to use a operator "index" in your policer +rules (example "index 20"). All your rules have to use the same index to share.] - + -If the second meter is exceeded the color of the flow changes further to 3. -We then pass the packet to another meter which is shared across all devices in the system. If this meter is exceeded we drop the packet. -Note the mark can be used further up the system to do things like policy +Note the mark can be used further up the system to do things like policy or more interesting things on the egress. ------------------ cut here ------------------------------- @@ -161,31 +161,31 @@ action ipt -j mark --set-mark 3 \ # and then attempt to borrow from a meter used by all devices in the # system. Should this be exceeded, drop the packet on the floor. 
action police index 20 mtu 5000 rate 1kbit burst 90k drop ---------------------------------- +--------------------------------- -Now lets see the actions installed with +Now lets see the actions installed with "tc filter show parent ffff: dev eth0" -------- output ----------- jroot# tc filter show parent ffff: dev eth0 -filter protocol ip pref 1 u32 -filter protocol ip pref 1 u32 fh 800: ht divisor 1 -filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15 +filter protocol ip pref 1 u32 +filter protocol ip pref 1 u32 fh 800: ht divisor 1 +filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x1 index 2 - action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb + action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb - action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x2 index 1 - action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b + action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b - action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x3 index 3 - action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b + action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b match 0a000015/ffffffff at 12 ------------------------------- @@ -209,31 +209,31 @@ Now lets take a look at the stats with "tc -s filter show parent ffff: dev eth0" -------------- jroot# tc -s filter show parent ffff: dev eth0 -filter protocol ip pref 1 u32 -filter protocol ip pref 1 u32 fh 800: ht divisor 1 +filter protocol ip pref 1 u32 +filter protocol ip pref 1 u32 fh 800: ht divisor 1 filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 -5 
+5 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x1 index 2 - Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0) + Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0) - action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb - Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122) + action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb + Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122) - action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x2 index 1 - Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0) + Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0) - action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b - Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945) + action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b + Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945) - action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING + action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING target MARK set 0x3 index 3 - Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0) + Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0) - action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b - Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437) + action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b + Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437) match 0a000015/ffffffff at 12 ------------------------------- @@ -254,4 +254,3 @@ At the moment the focus has been on getting the architecture in place. Expect new things in the spurious time i have to work on this (particularly around end of year when i have typically get time off from work). 
- diff --git a/doc/actions/gact-usage b/doc/actions/gact-usage index de1308dd..5fc3e628 100644 --- a/doc/actions/gact-usage +++ b/doc/actions/gact-usage @@ -1,13 +1,13 @@ gact [RAND] [INDEX] -Where: - ACTION := reclassify | drop | continue | pass | ok +Where: + ACTION := reclassify | drop | continue | pass | ok RAND := random RANDTYPE := netrand | determ VAL : = value not exceeding 10000 INDEX := index value used - + ACTION semantics - pass and ok are equivalent to accept - continue allows to restart classification lookup @@ -42,14 +42,14 @@ filter u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 32 suc random type none pass val 0 index 1 ref 1 bind 1 installed 59 sec used 35 sec Sent 1680 bytes 20 pkts (dropped 20, overlimits 0 ) - + ---- # example 2 #allow 1 out 10 randomly using the netrand generator tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ 10.0.0.9/32 flowid 1:16 action drop random netrand ok 10 - + ping -c 20 10.0.0.9 ---- @@ -59,14 +59,14 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1 random type netrand pass val 10 index 5 ref 1 bind 1 installed 49 sec used 25 sec Sent 1680 bytes 20 pkts (dropped 16, overlimits 0 ) - + -------- #alternative: deterministically accept every second packet tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \ 10.0.0.9/32 flowid 1:16 action drop random determ ok 2 - + ping -c 20 10.0.0.9 - + tc -s filter show parent ffff: dev eth0 ----- filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1filter protocol ip pref 6 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 20 success 20) @@ -76,4 +76,3 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1 index 4 ref 1 bind 1 installed 118 sec used 82 sec Sent 1680 bytes 20 pkts (dropped 10, overlimits 0 ) ----- - diff --git a/doc/actions/ifb-README b/doc/actions/ifb-README index 63247f3c..5fe91714 100644 --- 
a/doc/actions/ifb-README +++ b/doc/actions/ifb-README @@ -6,18 +6,18 @@ with a _lot_ less code. Known IMQ/IFB USES ------------------ -As far as i know the reasons listed below is why people use IMQ. +As far as i know the reasons listed below is why people use IMQ. It would be nice to know of anything else that i missed. 1) qdiscs/policies that are per device as opposed to system wide. IFB allows for sharing. 2) Allows for queueing incoming traffic for shaping instead of -dropping. I am not aware of any study that shows policing is +dropping. I am not aware of any study that shows policing is worse than shaping in achieving the end goal of rate control. I would be interested if anyone is experimenting. -3) Very interesting use: if you are serving p2p you may want to give +3) Very interesting use: if you are serving p2p you may want to give preference to your own locally originated traffic (when responses come back) vs someone using your system to do bittorent. So QoSing based on state comes in as the solution. What people did to achieve this was stick @@ -25,17 +25,17 @@ the IMQ somewhere prelocal hook. I think this is a pretty neat feature to have in Linux in general. (i.e not just for IMQ). But i won't go back to putting netfilter hooks in the device to satisfy -this. I also don't think its worth it hacking ifb some more to be +this. I also don't think its worth it hacking ifb some more to be aware of say L3 info and play ip rule tricks to achieve this. --> Instead the plan is to have a conntrack related action. This action will -selectively either query/create conntrack state on incoming packets. -Packets could then be redirected to ifb based on what happens -> eg -on incoming packets; if we find they are of known state we could send to +selectively either query/create conntrack state on incoming packets. 
+Packets could then be redirected to ifb based on what happens -> eg +on incoming packets; if we find they are of known state we could send to a different queue than one which didn't have existing state. This all however is dependent on whatever rules the admin enters. At the moment this 3rd function does not exist yet. I have decided that -instead of sitting on the patch for another year, to release it and then +instead of sitting on the patch for another year, to release it and then if there is pressure i will add this feature. An example, to provide functionality that most people use IMQ for below: @@ -43,10 +43,10 @@ An example, to provide functionality that most people use IMQ for below: -------- export TC="/sbin/tc" -$TC qdisc add dev ifb0 root handle 1: prio +$TC qdisc add dev ifb0 root handle 1: prio $TC qdisc add dev ifb0 parent 1:1 handle 10: sfq $TC qdisc add dev ifb0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000 -$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq +$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq $TC filter add dev ifb0 protocol ip pref 1 parent 1: handle 1 fw classid 1:1 $TC filter add dev ifb0 protocol ip pref 2 parent 1: handle 2 fw classid 1:2 @@ -54,7 +54,7 @@ ifconfig ifb0 up $TC qdisc add dev eth0 ingress -# redirect all IP packets arriving in eth0 to ifb0 +# redirect all IP packets arriving in eth0 to ifb0 # use mark 1 --> puts them onto class 1:1 $TC filter add dev eth0 parent ffff: protocol ip prio 10 u32 \ match u32 0 0 flowid 1:1 \ @@ -77,44 +77,44 @@ PING 10.22 (10.0.0.22): 56 data bytes --- 10.22 ping statistics --- 3 packets transmitted, 3 packets received, 0% packet loss round-trip min/avg/max = 0.6/1.3/2.8 ms -[root@jzny action-tests]# +[root@jzny action-tests]# ----- Now look at some stats: --- [root@jmandrake]:~# $TC -s filter show parent ffff: dev eth0 -filter protocol ip pref 10 u32 -filter protocol ip pref 10 u32 fh 800: ht divisor 1 -filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 
0 flowid 1:1 +filter protocol ip pref 10 u32 +filter protocol ip pref 10 u32 fh 800: ht divisor 1 +filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 match 00000000/00000000 at 0 - action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING - target MARK set 0x1 - index 1 ref 1 bind 1 installed 4195sec used 27sec - Sent 252 bytes 3 pkts (dropped 0, overlimits 0) + action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING + target MARK set 0x1 + index 1 ref 1 bind 1 installed 4195sec used 27sec + Sent 252 bytes 3 pkts (dropped 0, overlimits 0) action order 2: mirred (Egress Redirect to device ifb0) stolen index 1 ref 1 bind 1 installed 165 sec used 27 sec - Sent 252 bytes 3 pkts (dropped 0, overlimits 0) + Sent 252 bytes 3 pkts (dropped 0, overlimits 0) [root@jmandrake]:~# $TC -s qdisc -qdisc sfq 30: dev ifb0 limit 128p quantum 1514b - Sent 0 bytes 0 pkts (dropped 0, overlimits 0) -qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s - Sent 210 bytes 3 pkts (dropped 0, overlimits 0) -qdisc sfq 10: dev ifb0 limit 128p quantum 1514b - Sent 294 bytes 3 pkts (dropped 0, overlimits 0) +qdisc sfq 30: dev ifb0 limit 128p quantum 1514b + Sent 0 bytes 0 pkts (dropped 0, overlimits 0) +qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s + Sent 210 bytes 3 pkts (dropped 0, overlimits 0) +qdisc sfq 10: dev ifb0 limit 128p quantum 1514b + Sent 294 bytes 3 pkts (dropped 0, overlimits 0) qdisc prio 1: dev ifb0 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 - Sent 504 bytes 6 pkts (dropped 0, overlimits 0) -qdisc ingress ffff: dev eth0 ---------------- - Sent 308 bytes 5 pkts (dropped 0, overlimits 0) + Sent 504 bytes 6 pkts (dropped 0, overlimits 0) +qdisc ingress ffff: dev eth0 ---------------- + Sent 308 bytes 5 pkts (dropped 0, overlimits 0) [root@jmandrake]:~# ifconfig ifb0 -ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00 +ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00 inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link UP BROADCAST 
RUNNING NOARP MTU:1500 Metric:1 RX packets:6 errors:0 dropped:3 overruns:0 frame:0 TX packets:3 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:32 + collisions:0 txqueuelen:32 RX bytes:504 (504.0 b) TX bytes:252 (252.0 b) ----- diff --git a/doc/actions/mirred-usage b/doc/actions/mirred-usage index e749eedc..482ff66d 100644 --- a/doc/actions/mirred-usage +++ b/doc/actions/mirred-usage @@ -7,10 +7,10 @@ flow to be mirrored. High end switches typically can select based on more than just a port (eg a 5 tuple classifier). They may also be capable of redirecting. -Usage: +Usage: -mirred [index INDEX] -where: +mirred [index INDEX] +where: DIRECTION := ACTION := INDEX is the specific policy instance id @@ -18,7 +18,7 @@ DEVICENAME is the devicename Direction: - Ingress is not supported at the moment. It will be in the -future as well as mirror/redirecting to a socket. +future as well as mirror/redirecting to a socket. Action: - Mirror takes a copy of the packet and sends it to specified @@ -29,14 +29,14 @@ steals the packet and redirects to specified destination dev. What NOT to do if you don't want your machine to crash: ------------------------------------------------------ -Do not create loops! +Do not create loops! Loops are not hard to create in the egress qdiscs. Here are simple rules to follow if you don't want to get hurt: A) Do not have the same packet go to same netdevice twice in a single graph of policies. Your machine will just hang! -This is design intent _not a bug_ to teach you some lessons. +This is design intent _not a bug_ to teach you some lessons. In the future if there are easy ways to do this in the kernel without affecting other packets not interested in this feature @@ -51,7 +51,7 @@ B) Do not redirect from one IFB device to another. Remember that IFB is a very specialized case of packet redirecting device. Instead of redirecting it puts packets at the exact spot on the stack it found them from. 
-Redirecting from ifbX->ifbY will actually not crash your machine but your +Redirecting from ifbX->ifbY will actually not crash your machine but your packets will all be dropped (this is much simpler to detect and resolve and is only affecting users of ifb as opposed to the whole stack). @@ -64,7 +64,7 @@ Some examples: 1) Mirror all packets arriving on eth0 to be sent out on eth1. You may have a sniffer or some accounting box hooked up on eth1. - + --- tc qdisc add dev eth0 ingress tc filter add dev eth0 parent ffff: protocol ip prio 10 u32 \ @@ -100,7 +100,7 @@ stack (i.e ping would work). 3) Even more funky example: # -#allow 1 out 10 packets on ingress of lo to randomly make it to the +#allow 1 out 10 packets on ingress of lo to randomly make it to the # host A (Randomness uses the netrand generator) # --- @@ -111,9 +111,9 @@ action mirred egress mirror dev eth0 --- 4) -# for packets from 10.0.0.9 going out on eth0 (could be local -# IP or something # we are forwarding) - -# if exceeding a 100Kbps rate, then redirect to eth1 +# for packets from 10.0.0.9 going out on eth0 (could be local +# IP or something # we are forwarding) - +# if exceeding a 100Kbps rate, then redirect to eth1 # --- @@ -158,7 +158,7 @@ Essentially a good debugging/logging interface (sort of like BSDs speacialized log device does without needing one). If you replace mirror with redirect, those packets will be -blackholed and will never make it out. +blackholed and will never make it out. 
cheers, jamal diff --git a/include/json_print.h b/include/json_print.h index ee087c3e..dbdc90e2 100644 --- a/include/json_print.h +++ b/include/json_print.h @@ -64,6 +64,7 @@ _PRINT_FUNC(null, const char*); _PRINT_FUNC(string, const char*); _PRINT_FUNC(uint, unsigned int); _PRINT_FUNC(u64, uint64_t); +_PRINT_FUNC(hhu, unsigned char); _PRINT_FUNC(hu, unsigned short); _PRINT_FUNC(hex, unsigned int); _PRINT_FUNC(0xhex, unsigned long long); diff --git a/include/json_writer.h b/include/json_writer.h index 17d409e0..b52dc2d0 100644 --- a/include/json_writer.h +++ b/include/json_writer.h @@ -38,6 +38,7 @@ void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num); void jsonw_uint(json_writer_t *self, unsigned int number); void jsonw_u64(json_writer_t *self, uint64_t number); void jsonw_xint(json_writer_t *self, uint64_t number); +void jsonw_hhu(json_writer_t *self, unsigned char num); void jsonw_hu(json_writer_t *self, unsigned short number); void jsonw_int(json_writer_t *self, int number); void jsonw_s64(json_writer_t *self, int64_t number); @@ -52,6 +53,7 @@ void jsonw_float_field(json_writer_t *self, const char *prop, double num); void jsonw_uint_field(json_writer_t *self, const char *prop, unsigned int num); void jsonw_u64_field(json_writer_t *self, const char *prop, uint64_t num); void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num); +void jsonw_hhu_field(json_writer_t *self, const char *prop, unsigned char num); void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num); void jsonw_int_field(json_writer_t *self, const char *prop, int num); void jsonw_s64_field(json_writer_t *self, const char *prop, int64_t num); diff --git a/include/libnetlink.h b/include/libnetlink.h index 138840d5..0854d6ad 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -23,6 +23,7 @@ struct rtnl_handle { FILE *dump_fp; #define RTNL_HANDLE_F_LISTEN_ALL_NSID 0x01 #define RTNL_HANDLE_F_SUPPRESS_NLERR 0x02 +#define 
RTNL_HANDLE_F_STRICT_CHK 0x04 int flags; }; @@ -46,16 +47,22 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned int subscriptions, __attribute__((warn_unused_result)); void rtnl_close(struct rtnl_handle *rth); +void rtnl_set_strict_dump(struct rtnl_handle *rth); -int rtnl_addrdump_req(struct rtnl_handle *rth, int family) +typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); + +int rtnl_addrdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); -int rtnl_routedump_req(struct rtnl_handle *rth, int family) +int rtnl_routedump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_ruledump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); -int rtnl_neighdump_req(struct rtnl_handle *rth, int family) +int rtnl_neighdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) __attribute__((warn_unused_result)); int rtnl_neightbldump_req(struct rtnl_handle *rth, int family) __attribute__((warn_unused_result)); @@ -71,8 +78,6 @@ int rtnl_linkdump_req(struct rtnl_handle *rth, int fam) int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) __attribute__((warn_unused_result)); -typedef int (*req_filter_fn_t)(struct nlmsghdr *nlh, int reqlen); - int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int fam, req_filter_fn_t fn) __attribute__((warn_unused_result)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ff651ca6..fb541e16 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can 
change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -232,6 +240,20 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads and stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 @@ -257,9 +279,6 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) -/* flags for BPF_PROG_QUERY */ -#define BPF_F_QUERY_EFFECTIVE (1U << 0) - #define BPF_OBJ_NAME_LEN 16U /* Flags for accessing BPF object */ @@ -269,6 +288,12 @@ enum bpf_attach_type { /* Flag for stack_map, store build_id+offset instead of pointer */ #define BPF_F_STACK_BUILD_ID (1U << 5) +/* Zero-initialize hash function seed. This should only be used for testing.
*/ +#define BPF_F_ZERO_SEED (1U << 6) + +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, @@ -326,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ @@ -335,6 +360,13 @@ union bpf_attr { * (context accesses, allowed helpers, etc). */ __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -353,8 +385,11 @@ union bpf_attr { struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ __u32 prog_fd; __u32 retval; - __u32 data_size_in; - __u32 data_size_out; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ __aligned_u64 data_in; __aligned_u64 data_out; __u32 repeat; @@ -475,18 +510,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) - * Description - * Pop an element from *map*. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) - * Description - * Get an element from *map* without removing it. 
- * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1910,9 +1933,9 @@ union bpf_attr { * is set to metric from route (IPv4/IPv6 only), and ifindex * is set to the device index of the nexthop from the FIB lookup. * - * *plen* argument is the size of the passed in struct. - * *flags* argument can be a combination of one or more of the - * following values: + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: * * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB @@ -1921,9 +1944,9 @@ union bpf_attr { * Perform lookup from an egress perspective (default is * ingress). * - * *ctx* is either **struct xdp_md** for XDP programs or - * **struct sk_buff** tc cls_act programs. - * Return + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return * * < 0 if any input argument is invalid * * 0 on success (packet is forwarded, nexthop neighbor exists) * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the @@ -2068,8 +2091,8 @@ union bpf_attr { * translated to a keycode using the rc keymap, and reported as * an input key down event. After a period a key up event is * generated. This period can be extended by calling either - * **bpf_rc_keydown** () again with the same values, or calling - * **bpf_rc_repeat** (). + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * * Some protocols include a toggle bit, in case the button was * released and pressed again between consecutive scancodes. @@ -2152,21 +2175,22 @@ union bpf_attr { * The *flags* meaning is specific for each map type, * and has to be 0 for cgroup local storage. 
* - * Depending on the bpf program type, a local storage area - * can be shared between multiple instances of the bpf program, + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the BPF_STX_XADD instruction to alter + * For example, by using the **BPF_STX_XADD** instruction to alter * the shared data. * Return - * Pointer to the local storage area. + * A pointer to the local storage area. * * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description - * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map - * It checks the selected sk is matching the incoming - * request in the skb. + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. * Return * 0 on success, or a negative error in case of failure. * @@ -2174,7 +2198,7 @@ union bpf_attr { * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, - * and if non-NULL, released via **bpf_sk_release**\ (). + * and if non-**NULL**, released via **bpf_sk_release**\ (). * * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). This is used @@ -2202,15 +2226,15 @@ union bpf_attr { * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return - * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, the *struct bpf_sock* - * result is from reuse->socks[] using the hash of the tuple. + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 
+ * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, - * and if non-NULL, released via **bpf_sk_release**\ (). + * and if non-**NULL**, released via **bpf_sk_release**\ (). * * The *ctx* should point to the context of the program, such as * the skb or socket (depending on the hook in use). This is used @@ -2238,33 +2262,71 @@ union bpf_attr { * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return - * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, the *struct bpf_sock* - * result is from reuse->socks[] using the hash of the tuple. + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. * - * int bpf_sk_release(struct bpf_sock *sk) + * int bpf_sk_release(struct bpf_sock *sock) * Description - * Release the reference held by *sock*. *sock* must be a non-NULL - * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. 
+ * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) * Description - * For socket policies, insert *len* bytes into msg at offset + * For socket policies, insert *len* bytes into *msg* at offset * *start*. * * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a - * *msg* it may want to insert metadata or options into the msg. + * *msg* it may want to insert metadata or options into the *msg*. * This can later be read and used by any of the lower layer BPF * hooks. * * This helper may fail if under memory pressure (a malloc * fails) in these cases BPF programs will get an appropriate * error and BPF programs will need to handle them. - * * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * Description + * Will remove *pop* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being too large. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**".
+ * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2357,7 +2419,9 @@ union bpf_attr { FN(map_push_elem), \ FN(map_pop_elem), \ FN(map_peek_elem), \ - FN(msg_push_data), + FN(msg_push_data), \ + FN(msg_pop_data), \ + FN(rc_pointer_rel), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2474,6 +2538,8 @@ struct __sk_buff { __u32 data_meta; __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; }; struct bpf_tunnel_key { @@ -2599,6 +2665,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { @@ -2649,6 +2716,18 @@ struct bpf_prog_info { __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2960,4 +3039,19 @@ struct bpf_flow_keys { }; }; +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 8d2a8ffa..cb4cf8cc 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,13 +34,16 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. 
int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. * "size" tells the size of the type it is describing. * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. * "type" is a type_id referring to another type. */ union { @@ -51,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -64,8 +68,10 @@ struct btf_type { #define BTF_KIND_VOLATILE 9 /* Volatile */ #define BTF_KIND_CONST 10 /* Const */ #define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_MAX 11 -#define NR_BTF_KINDS 12 +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -107,7 +113,29 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; +}; + +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". 
+ * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; }; #endif /* __LINUX_BTF_H__ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5ee0e739..d0a33d79 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -163,6 +163,11 @@ enum devlink_param_cmode { DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1 }; +enum devlink_param_fw_load_policy_value { + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_DRIVER, + DEVLINK_PARAM_FW_LOAD_POLICY_VALUE_FLASH, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bdfecf94..04f763cf 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -292,4 +292,25 @@ struct br_mcast_stats { __u64 mcast_bytes[BR_MCAST_DIR_SIZE]; __u64 mcast_packets[BR_MCAST_DIR_SIZE]; }; + +/* bridge boolean options + * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets + * + * IMPORTANT: if adding a new option do not forget to handle + * it in br_boolopt_toggle/get and bridge sysfs + */ +enum br_boolopt_id { + BR_BOOLOPT_NO_LL_LEARN, + BR_BOOLOPT_MAX +}; + +/* struct br_boolopt_multi - change multiple bridge boolean options + * + * @optval: new option values (bit per option) + * @optmask: options to change (bit per option) + */ +struct br_boolopt_multi { + __u32 optval; + __u32 optmask; +}; #endif /* _LINUX_IF_BRIDGE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 52e95197..484ddf83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -286,6 +286,7 @@ enum { IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, __IFLA_BR_MAX, }; @@ -531,6 +532,7 @@ enum { IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, + 
IFLA_VXLAN_DF, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -540,6 +542,14 @@ struct ifla_vxlan_port_range { __be16 high; }; +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, @@ -555,10 +565,19 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + /* PPP section */ enum { IFLA_PPP_UNSPEC, diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index be9b744a..2f011655 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -59,6 +59,7 @@ #define TUNGETVNETBE _IOR('T', 223, int) #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) +#define TUNSETCARRIER _IOW('T', 226, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 99815544..cd144e30 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -28,6 +28,7 @@ enum { NDA_MASTER, NDA_LINK_NETNSID, NDA_SRC_VNI, + NDA_PROTOCOL, /* Originator of entry */ __NDA_MAX }; diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h index 6d64d071..fa81f1e5 100644 --- a/include/uapi/linux/net_namespace.h +++ b/include/uapi/linux/net_namespace.h @@ -16,6 +16,8 @@ enum { NETNSA_NSID, NETNSA_PID, NETNSA_FD, + NETNSA_TARGET_NSID, + NETNSA_CURRENT_NSID, __NETNSA_MAX, }; diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 36378a0a..899be986 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -33,10 +33,6 @@ #define NF_DROP_ERR(x) (((-x) << 16) 
| NF_DROP) /* only for userspace compatibility */ -/* Generic cache responses from hook functions. - <= 0x2000 is used for protocol-flags. */ -#define NFC_UNKNOWN 0x4000 -#define NFC_ALTERED 0x8000 /* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */ #define NF_VERDICT_BITS 16 diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 13eeada5..153c517a 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -13,8 +13,9 @@ #include -/* The protocol version */ -#define IPSET_PROTOCOL 6 +/* The protocol versions */ +#define IPSET_PROTOCOL 7 +#define IPSET_PROTOCOL_MIN 6 /* The max length of strings including NUL: set and type identifiers */ #define IPSET_MAXNAMELEN 32 @@ -38,17 +39,19 @@ enum ipset_cmd { IPSET_CMD_TEST, /* 11: Test an element in a set */ IPSET_CMD_HEADER, /* 12: Get set header data only */ IPSET_CMD_TYPE, /* 13: Get set type */ + IPSET_CMD_GET_BYNAME, /* 14: Get set index by name */ + IPSET_CMD_GET_BYINDEX, /* 15: Get set name by index */ IPSET_MSG_MAX, /* Netlink message commands */ /* Commands in userspace: */ - IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 14: Enter restore mode */ - IPSET_CMD_HELP, /* 15: Get help */ - IPSET_CMD_VERSION, /* 16: Get program version */ - IPSET_CMD_QUIT, /* 17: Quit from interactive mode */ + IPSET_CMD_RESTORE = IPSET_MSG_MAX, /* 16: Enter restore mode */ + IPSET_CMD_HELP, /* 17: Get help */ + IPSET_CMD_VERSION, /* 18: Get program version */ + IPSET_CMD_QUIT, /* 19: Quit from interactive mode */ IPSET_CMD_MAX, - IPSET_CMD_COMMIT = IPSET_CMD_MAX, /* 18: Commit buffered commands */ + IPSET_CMD_COMMIT = IPSET_CMD_MAX, /* 20: Commit buffered commands */ }; /* Attributes at command level */ @@ -66,6 +69,7 @@ enum { IPSET_ATTR_LINENO, /* 9: Restore lineno */ IPSET_ATTR_PROTOCOL_MIN, /* 10: Minimal supported version number */ IPSET_ATTR_REVISION_MIN = IPSET_ATTR_PROTOCOL_MIN, /* type rev min */ + 
IPSET_ATTR_INDEX, /* 11: Kernel index of set */ __IPSET_ATTR_CMD_MAX, }; #define IPSET_ATTR_CMD_MAX (__IPSET_ATTR_CMD_MAX - 1) @@ -223,6 +227,7 @@ enum ipset_adt { /* Sets are identified by an index in kernel space. Tweak with ip_set_id_t * and IPSET_INVALID_ID if you want to increase the max number of sets. + * Also, IPSET_ATTR_INDEX must be changed. */ typedef __u16 ip_set_id_t; diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index 074e2c8b..96979e37 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -12,34 +12,6 @@ #include /* for INT_MIN, INT_MAX */ -/* IP Cache bits. */ -/* Src IP address. */ -#define NFC_IP_SRC 0x0001 -/* Dest IP address. */ -#define NFC_IP_DST 0x0002 -/* Input device. */ -#define NFC_IP_IF_IN 0x0004 -/* Output device. */ -#define NFC_IP_IF_OUT 0x0008 -/* TOS. */ -#define NFC_IP_TOS 0x0010 -/* Protocol. */ -#define NFC_IP_PROTO 0x0020 -/* IP options. */ -#define NFC_IP_OPTIONS 0x0040 -/* Frag & flags. */ -#define NFC_IP_FRAG 0x0080 - -/* Per-protocol information: only matters if proto match. */ -/* TCP flags. */ -#define NFC_IP_TCPFLAGS 0x0100 -/* Source port. */ -#define NFC_IP_SRC_PT 0x0200 -/* Dest port. */ -#define NFC_IP_DST_PT 0x0400 -/* Something else about the proto */ -#define NFC_IP_PROTO_UNKNOWN 0x2000 - /* IP Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP_PRE_ROUTING 0 diff --git a/include/uapi/linux/netfilter_ipv6.h b/include/uapi/linux/netfilter_ipv6.h index 92701fe8..eedf7a2d 100644 --- a/include/uapi/linux/netfilter_ipv6.h +++ b/include/uapi/linux/netfilter_ipv6.h @@ -15,35 +15,6 @@ #include /* for INT_MIN, INT_MAX */ -/* IP Cache bits. */ -/* Src IP address. */ -#define NFC_IP6_SRC 0x0001 -/* Dest IP address. */ -#define NFC_IP6_DST 0x0002 -/* Input device. */ -#define NFC_IP6_IF_IN 0x0004 -/* Output device. */ -#define NFC_IP6_IF_OUT 0x0008 -/* TOS. */ -#define NFC_IP6_TOS 0x0010 -/* Protocol. 
*/ -#define NFC_IP6_PROTO 0x0020 -/* IP options. */ -#define NFC_IP6_OPTIONS 0x0040 -/* Frag & flags. */ -#define NFC_IP6_FRAG 0x0080 - - -/* Per-protocol information: only matters if proto match. */ -/* TCP flags. */ -#define NFC_IP6_TCPFLAGS 0x0100 -/* Source port. */ -#define NFC_IP6_SRC_PT 0x0200 -/* Dest port. */ -#define NFC_IP6_DST_PT 0x0400 -/* Something else about the proto */ -#define NFC_IP6_PROTO_UNKNOWN 0x2000 - /* IP6 Hooks */ /* After promisc drops, checksum checks. */ #define NF_IP6_PRE_ROUTING 0 diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 401d0c1e..95d0db2a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -485,6 +485,11 @@ enum { TCA_FLOWER_IN_HW_COUNT, + TCA_FLOWER_KEY_PORT_SRC_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_SRC_MAX, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MIN, /* be16 */ + TCA_FLOWER_KEY_PORT_DST_MAX, /* be16 */ + __TCA_FLOWER_MAX, }; @@ -518,6 +523,8 @@ enum { TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), }; +#define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ + /* Match-all classifier */ enum { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 89ee47c2..0d18b1d1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -291,11 +291,38 @@ enum { TCA_GRED_DPS, TCA_GRED_MAX_P, TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ __TCA_GRED_MAX, }; #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + 
TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + struct tc_gred_qopt { __u32 limit; /* HARD maximal queue length (bytes) */ __u32 qth_min; /* Min average length threshold (bytes) */ @@ -864,6 +891,8 @@ enum { TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + __TCA_FQ_MAX }; @@ -882,6 +911,7 @@ struct tc_fq_qd_stats { __u32 inactive_flows; __u32 throttled_flows; __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ }; /* Heavy-Hitter Filter */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index c4d3a8a2..e9970b69 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_STREAM_SCHEDULER_VALUE 124 #define SCTP_INTERLEAVING_SUPPORTED 125 #define SCTP_SENDMSG_CONNECT 126 +#define SCTP_EVENT 127 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -632,7 +633,9 @@ union sctp_notification { */ enum sctp_sn_type { - SCTP_SN_TYPE_BASE = (1<<15), + SCTP_SN_TYPE_BASE = (1<<15), + SCTP_DATA_IO_EVENT = SCTP_SN_TYPE_BASE, +#define SCTP_DATA_IO_EVENT SCTP_DATA_IO_EVENT SCTP_ASSOC_CHANGE, #define SCTP_ASSOC_CHANGE SCTP_ASSOC_CHANGE SCTP_PEER_ADDR_CHANGE, @@ -657,6 +660,8 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT + SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, +#define SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; /* Notification error codes used to fill up the error fields in some @@ -1144,6 +1149,12 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +struct sctp_event { + sctp_assoc_t se_assoc_id; + uint16_t se_type; + uint8_t se_on; +}; + /* SCTP Stream schedulers */ enum sctp_sched_type { 
SCTP_SS_FCFS, diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f80135e5..86dc24a9 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -243,6 +243,7 @@ enum LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */ LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */ + LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */ LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */ LINUX_MIB_TCPOFODROP, /* TCPOFODrop */ LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6ec77662..799b5c5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -266,6 +266,7 @@ enum { TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ TCP_NLA_REORD_SEEN, /* reordering events seen */ + TCP_NLA_SRTT, /* smoothed RTT in usecs */ }; /* for TCP_MD5SIG socket option */ diff --git a/include/utils.h b/include/utils.h index 1630dd0b..92bbe82d 100644 --- a/include/utils.h +++ b/include/utils.h @@ -116,13 +116,6 @@ struct dn_naddr unsigned char a_addr[DN_MAXADDL]; }; -#define IPX_NODE_LEN 6 - -struct ipx_addr { - u_int32_t ipx_net; - u_int8_t ipx_node[IPX_NODE_LEN]; -}; - #ifndef AF_MPLS # define AF_MPLS 28 #endif @@ -201,12 +194,6 @@ int matches(const char *arg, const char *pattern); int inet_addr_match(const inet_prefix *a, const inet_prefix *b, int bits); int inet_addr_match_rta(const inet_prefix *m, const struct rtattr *rta); -const char *dnet_ntop(int af, const void *addr, char *str, size_t len); -int dnet_pton(int af, const char *src, void *addr); - -const char *ipx_ntop(int af, const void *addr, char *str, size_t len); -int ipx_pton(int af, const char *src, void *addr); - const char *mpls_ntop(int af, const void *addr, char *str, size_t len); int mpls_pton(int af, const char *src, void *addr, size_t alen); diff --git a/ip/ip.c b/ip/ip.c index c324120f..e4131714 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -53,7 
+53,7 @@ static void usage(void) " vrf | sr }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n" " -h[uman-readable] | -iec | -j[son] | -p[retty] |\n" -" -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n" +" -f[amily] { inet | inet6 | mpls | bridge | link } |\n" " -4 | -6 | -I | -D | -M | -B | -0 |\n" " -l[oops] { maximum-addr-flush-attempts } | -br[ief] |\n" " -o[neline] | -t[imestamp] | -ts[hort] | -b[atch] [filename] |\n" @@ -225,8 +225,6 @@ int main(int argc, char **argv) preferred_family = AF_INET6; } else if (strcmp(opt, "-0") == 0) { preferred_family = AF_PACKET; - } else if (strcmp(opt, "-I") == 0) { - preferred_family = AF_IPX; } else if (strcmp(opt, "-D") == 0) { preferred_family = AF_DECnet; } else if (strcmp(opt, "-M") == 0) { @@ -310,6 +308,8 @@ int main(int argc, char **argv) if (rtnl_open(&rth, 0) < 0) exit(1); + rtnl_set_strict_dump(&rth); + if (strlen(basename) > 2) return do_cmd(basename+2, argc, argv); diff --git a/ip/ip_common.h b/ip/ip_common.h index 53668f59..d67575c6 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -84,8 +84,7 @@ int do_seg6(int argc, char **argv); int iplink_get(char *name, __u32 filt_mask); int iplink_ifla_xstats(int argc, char **argv); -int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, - struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo); +int ip_link_list(req_filter_fn_t filter_fn, struct nlmsg_chain *linfo); void free_nlmsg_chain(struct nlmsg_chain *info); static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 016662e9..2bc33f3a 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1679,6 +1679,15 @@ static void ipaddr_filter(struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo) } } +static int ipaddr_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct ifaddrmsg *ifa = NLMSG_DATA(nlh); + + ifa->ifa_index = filter.ifindex; + + return 0; +} + static int ipaddr_flush(void) { int round = 0; @@ 
-1689,7 +1698,8 @@ static int ipaddr_flush(void) filter.flushe = sizeof(flushb); while ((max_flush_loops == 0) || (round < max_flush_loops)) { - if (rtnl_addrdump_req(&rth, filter.family) < 0) { + if (rtnl_addrdump_req(&rth, filter.family, + ipaddr_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -1762,12 +1772,41 @@ static int iplink_filter_req(struct nlmsghdr *nlh, int reqlen) return 0; } +static int ipaddr_link_get(int index, struct nlmsg_chain *linfo) +{ + struct iplink_req req = { + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .n.nlmsg_flags = NLM_F_REQUEST, + .n.nlmsg_type = RTM_GETLINK, + .i.ifi_family = filter.family, + .i.ifi_index = index, + }; + __u32 filt_mask = RTEXT_FILTER_VF; + struct nlmsghdr *answer; + + if (!show_stats) + filt_mask |= RTEXT_FILTER_SKIP_STATS; + + addattr32(&req.n, sizeof(req), IFLA_EXT_MASK, filt_mask); + + if (rtnl_talk(&rth, &req.n, &answer) < 0) { + perror("Cannot send link request"); + return 1; + } + + if (store_nlmsg(answer, linfo) < 0) { + fprintf(stderr, "Failed to process link information\n"); + return 1; + } + + return 0; +} + /* fills in linfo with link data and optionally ainfo with address info * caller can walk lists as desired and must call free_nlmsg_chain for * both when done */ -int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, - struct nlmsg_chain *linfo, struct nlmsg_chain *ainfo) +int ip_link_list(req_filter_fn_t filter_fn, struct nlmsg_chain *linfo) { if (rtnl_linkdump_req_filter_fn(&rth, preferred_family, filter_fn) < 0) { @@ -1780,16 +1819,19 @@ int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, return 1; } - if (ainfo) { - if (rtnl_addrdump_req(&rth, family) < 0) { - perror("Cannot send dump request"); - return 1; - } + return 0; +} - if (rtnl_dump_filter(&rth, store_nlmsg, ainfo) < 0) { - fprintf(stderr, "Dump terminated\n"); - return 1; - } +static int ip_addr_list(struct nlmsg_chain *ainfo) +{ + if (rtnl_addrdump_req(&rth, filter.family, 
ipaddr_dump_filter) < 0) { + perror("Cannot send dump request"); + return 1; + } + + if (rtnl_dump_filter(&rth, store_nlmsg, ainfo) < 0) { + fprintf(stderr, "Dump terminated\n"); + return 1; } return 0; @@ -1798,7 +1840,7 @@ int ip_linkaddr_list(int family, req_filter_fn_t filter_fn, static int ipaddr_list_flush_or_save(int argc, char **argv, int action) { struct nlmsg_chain linfo = { NULL, NULL}; - struct nlmsg_chain _ainfo = { NULL, NULL}, *ainfo = NULL; + struct nlmsg_chain _ainfo = { NULL, NULL}, *ainfo = &_ainfo; struct nlmsg_list *l; char *filter_dev = NULL; int no_link = 0; @@ -1906,7 +1948,8 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) if (ipadd_save_prep()) exit(1); - if (rtnl_addrdump_req(&rth, preferred_family) < 0) { + if (rtnl_addrdump_req(&rth, preferred_family, + ipaddr_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -1940,19 +1983,23 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) goto out; } - if (filter.family != AF_PACKET) { - ainfo = &_ainfo; - - if (filter.oneline) - no_link = 1; + if (filter.ifindex) { + if (ipaddr_link_get(filter.ifindex, &linfo) != 0) + goto out; + } else { + if (ip_link_list(iplink_filter_req, &linfo) != 0) + goto out; } - if (ip_linkaddr_list(filter.family, iplink_filter_req, - &linfo, ainfo) != 0) - goto out; + if (filter.family != AF_PACKET) { + if (filter.oneline) + no_link = 1; + + if (ip_addr_list(ainfo) != 0) + goto out; - if (filter.family != AF_PACKET) ipaddr_filter(&linfo, ainfo); + } for (l = linfo.head; l; l = l->next) { struct nlmsghdr *n = &l->h; @@ -1971,8 +2018,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action) fflush(stdout); out: - if (ainfo) - free_nlmsg_chain(ainfo); + free_nlmsg_chain(ainfo); free_nlmsg_chain(&linfo); delete_json_obj(); return 0; diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index f1a12f45..278a6e23 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -24,6 +24,7 @@ 
static void print_explain(FILE *f) " remote ADDR\n" " [ ttl TTL ]\n" " [ tos TOS ]\n" + " [ df DF ]\n" " [ flowlabel LABEL ]\n" " [ dstport PORT ]\n" " [ [no]external ]\n" @@ -35,6 +36,7 @@ static void print_explain(FILE *f) " ADDR := IP_ADDRESS\n" " TOS := { NUMBER | inherit }\n" " TTL := { 1..255 | auto | inherit }\n" + " DF := { unset | set | inherit }\n" " LABEL := 0-1048575\n" ); } @@ -115,6 +117,22 @@ static int geneve_parse_opt(struct link_util *lu, int argc, char **argv, tos = uval; } else tos = 1; + } else if (!matches(*argv, "df")) { + enum ifla_geneve_df df; + + NEXT_ARG(); + check_duparg(&attrs, IFLA_GENEVE_DF, "df", *argv); + if (strcmp(*argv, "unset") == 0) + df = GENEVE_DF_UNSET; + else if (strcmp(*argv, "set") == 0) + df = GENEVE_DF_SET; + else if (strcmp(*argv, "inherit") == 0) + df = GENEVE_DF_INHERIT; + else + invarg("DF must be 'unset', 'set' or 'inherit'", + *argv); + + addattr8(n, 1024, IFLA_GENEVE_DF, df); } else if (!matches(*argv, "label") || !matches(*argv, "flowlabel")) { __u32 uval; @@ -287,6 +305,17 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) print_string(PRINT_FP, NULL, "tos %s ", "inherit"); } + if (tb[IFLA_GENEVE_DF]) { + enum ifla_geneve_df df = rta_getattr_u8(tb[IFLA_GENEVE_DF]); + + if (df == GENEVE_DF_UNSET) + print_string(PRINT_JSON, "df", "df %s ", "unset"); + else if (df == GENEVE_DF_SET) + print_string(PRINT_ANY, "df", "df %s ", "set"); + else if (df == GENEVE_DF_INHERIT) + print_string(PRINT_ANY, "df", "df %s ", "inherit"); + } + if (tb[IFLA_GENEVE_LABEL]) { __u32 label = rta_getattr_u32(tb[IFLA_GENEVE_LABEL]); diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index 62e76943..497affc0 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -31,6 +31,7 @@ static void print_explain(FILE *f) " [ local ADDR ]\n" " [ ttl TTL ]\n" " [ tos TOS ]\n" + " [ df DF ]\n" " [ flowlabel LABEL ]\n" " [ dev PHYS_DEV ]\n" " [ dstport PORT ]\n" @@ -52,6 +53,7 @@ static void print_explain(FILE *f) " 
ADDR := { IP_ADDRESS | any }\n" " TOS := { NUMBER | inherit }\n" " TTL := { 1..255 | auto | inherit }\n" + " DF := { unset | set | inherit }\n" " LABEL := 0-1048575\n" ); } @@ -170,6 +172,22 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, } else tos = 1; addattr8(n, 1024, IFLA_VXLAN_TOS, tos); + } else if (!matches(*argv, "df")) { + enum ifla_vxlan_df df; + + NEXT_ARG(); + check_duparg(&attrs, IFLA_VXLAN_DF, "df", *argv); + if (strcmp(*argv, "unset") == 0) + df = VXLAN_DF_UNSET; + else if (strcmp(*argv, "set") == 0) + df = VXLAN_DF_SET; + else if (strcmp(*argv, "inherit") == 0) + df = VXLAN_DF_INHERIT; + else + invarg("DF must be 'unset', 'set' or 'inherit'", + *argv); + + addattr8(n, 1024, IFLA_VXLAN_DF, df); } else if (!matches(*argv, "label") || !matches(*argv, "flowlabel")) { __u32 uval; @@ -538,6 +556,17 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) print_string(PRINT_FP, NULL, "ttl %s ", "auto"); } + if (tb[IFLA_VXLAN_DF]) { + enum ifla_vxlan_df df = rta_getattr_u8(tb[IFLA_VXLAN_DF]); + + if (df == VXLAN_DF_UNSET) + print_string(PRINT_JSON, "df", "df %s ", "unset"); + else if (df == VXLAN_DF_SET) + print_string(PRINT_ANY, "df", "df %s ", "set"); + else if (df == VXLAN_DF_INHERIT) + print_string(PRINT_ANY, "df", "df %s ", "inherit"); + } + if (tb[IFLA_VXLAN_LABEL]) { __u32 label = rta_getattr_u32(tb[IFLA_VXLAN_LABEL]); diff --git a/ip/ipmroute.c b/ip/ipmroute.c index 4d8867d3..b29c78e4 100644 --- a/ip/ipmroute.c +++ b/ip/ipmroute.c @@ -220,21 +220,36 @@ void ipmroute_reset_filter(int ifindex) filter.iif = ifindex; } +static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + int err; + + if (filter.tb) { + err = addattr32(nlh, reqlen, RTA_TABLE, filter.tb); + if (err) + return err; + } + + return 0; +} + static int mroute_list(int argc, char **argv) { char *id = NULL; - int family; + int family = preferred_family; ipmroute_reset_filter(0); - if (preferred_family == AF_UNSPEC) - family = 
AF_INET; - else - family = AF_INET6; - if (family == AF_INET) { + if (family == AF_INET || family == AF_UNSPEC) { + family = RTNL_FAMILY_IPMR; filter.af = RTNL_FAMILY_IPMR; filter.tb = RT_TABLE_DEFAULT; /* for backward compatibility */ - } else + } else if (family == AF_INET6) { + family = RTNL_FAMILY_IP6MR; filter.af = RTNL_FAMILY_IP6MR; + } else { + /* family does not have multicast routing */ + return 0; + } filter.msrc.family = filter.mdst.family = family; @@ -283,7 +298,7 @@ static int mroute_list(int argc, char **argv) filter.iif = idx; } - if (rtnl_routedump_req(&rth, filter.af) < 0) { + if (rtnl_routedump_req(&rth, filter.af, iproute_dump_filter) < 0) { perror("Cannot send dump request"); return 1; } diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 070b1acd..88596245 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -40,6 +40,8 @@ static struct int flushp; int flushe; int master; + int protocol; + __u8 ndm_flags; } filter; static void usage(void) __attribute__((noreturn)); @@ -48,7 +50,7 @@ static void usage(void) { fprintf(stderr, "Usage: ip neigh { add | del | change | replace }\n" " { ADDR [ lladdr LLADDR ] [ nud STATE ] | proxy ADDR } [ dev DEV ]\n"); - fprintf(stderr, " [ router ] [ extern_learn ]\n\n"); + fprintf(stderr, " [ router ] [ extern_learn ] [ protocol PROTO ]\n\n"); fprintf(stderr, " ip neigh { show | flush } [ proxy ] [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n"); fprintf(stderr, " [ vrf NAME ]\n\n"); fprintf(stderr, "STATE := { permanent | noarp | stale | reachable | none |\n" @@ -148,6 +150,14 @@ static int ipneigh_modify(int cmd, int flags, int argc, char **argv) NEXT_ARG(); dev = *argv; dev_ok = 1; + } else if (matches(*argv, "protocol") == 0) { + __u32 proto; + + NEXT_ARG(); + if (rtnl_rtprot_a2n(&proto, *argv)) + invarg("\"protocol\" value is invalid\n", *argv); + if (addattr8(&req.n, sizeof(req), NDA_PROTOCOL, proto)) + return -1; } else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); @@ -244,6 +254,7 @@ int print_neigh(struct nlmsghdr *n, 
void *arg) int len = n->nlmsg_len; struct rtattr *tb[NDA_MAX+1]; static int logit = 1; + __u8 protocol = 0; if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH && n->nlmsg_type != RTM_GETNEIGH) { @@ -285,6 +296,12 @@ int print_neigh(struct nlmsghdr *n, void *arg) if (inet_addr_match_rta(&filter.pfx, tb[NDA_DST])) return 0; + if (tb[NDA_PROTOCOL]) + protocol = rta_getattr_u8(tb[NDA_PROTOCOL]); + + if (filter.protocol && filter.protocol != protocol) + return 0; + if (filter.unused_only && tb[NDA_CACHEINFO]) { struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]); @@ -379,6 +396,13 @@ int print_neigh(struct nlmsghdr *n, void *arg) if (r->ndm_state) print_neigh_state(r->ndm_state); + if (protocol) { + SPRINT_BUF(b1); + + print_string(PRINT_ANY, "protocol", " proto %s ", + rtnl_rtprot_n2a(protocol, b1, sizeof(b1))); + } + print_string(PRINT_FP, NULL, "\n", ""); close_json_object(); fflush(stdout); @@ -393,16 +417,29 @@ void ipneigh_reset_filter(int ifindex) filter.index = ifindex; } +static int ipneigh_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + int err; + + ndm->ndm_flags = filter.ndm_flags; + + if (filter.index) { + err = addattr32(nlh, reqlen, NDA_IFINDEX, filter.index); + if (err) + return err; + } + if (filter.master) { + err = addattr32(nlh, reqlen, NDA_MASTER, filter.master); + if (err) + return err; + } + + return 0; +} + static int do_show_or_flush(int argc, char **argv, int flush) { - struct { - struct nlmsghdr n; - struct ndmsg ndm; - char buf[256]; - } req = { - .n.nlmsg_type = RTM_GETNEIGH, - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), - }; char *filter_dev = NULL; int state_given = 0; @@ -433,7 +470,6 @@ static int do_show_or_flush(int argc, char **argv, int flush) ifindex = ll_name_to_index(*argv); if (!ifindex) invarg("Device does not exist\n", *argv); - addattr32(&req.n, sizeof(req), NDA_MASTER, ifindex); filter.master = ifindex; } else if (strcmp(*argv, "vrf") == 0) { int ifindex; @@ 
-444,7 +480,6 @@ static int do_show_or_flush(int argc, char **argv, int flush) invarg("Not a valid VRF name\n", *argv); if (!name_is_vrf(*argv)) invarg("Not a valid VRF name\n", *argv); - addattr32(&req.n, sizeof(req), NDA_MASTER, ifindex); filter.master = ifindex; } else if (strcmp(*argv, "unused") == 0) { filter.unused_only = 1; @@ -466,9 +501,19 @@ static int do_show_or_flush(int argc, char **argv, int flush) if (state == 0) state = 0x100; filter.state |= state; - } else if (strcmp(*argv, "proxy") == 0) - req.ndm.ndm_flags = NTF_PROXY; - else { + } else if (strcmp(*argv, "proxy") == 0) { + filter.ndm_flags = NTF_PROXY; + } else if (matches(*argv, "protocol") == 0) { + __u32 prot; + + NEXT_ARG(); + if (rtnl_rtprot_a2n(&prot, *argv)) { + if (strcmp(*argv, "all")) + invarg("invalid \"protocol\"\n", *argv); + prot = 0; + } + filter.protocol = prot; + } else { if (strcmp(*argv, "to") == 0) { NEXT_ARG(); } @@ -488,11 +533,8 @@ static int do_show_or_flush(int argc, char **argv, int flush) filter.index = ll_name_to_index(filter_dev); if (!filter.index) return nodev(filter_dev); - addattr32(&req.n, sizeof(req), NDA_IFINDEX, filter.index); } - req.ndm.ndm_family = filter.family; - if (flush) { int round = 0; char flushb[4096-512]; @@ -502,7 +544,8 @@ static int do_show_or_flush(int argc, char **argv, int flush) filter.flushe = sizeof(flushb); while (round < MAX_ROUNDS) { - if (rtnl_dump_request_n(&rth, &req.n) < 0) { + if (rtnl_neighdump_req(&rth, filter.family, + ipneigh_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } @@ -535,7 +578,7 @@ static int do_show_or_flush(int argc, char **argv, int flush) return 1; } - if (rtnl_dump_request_n(&rth, &req.n) < 0) { + if (rtnl_neighdump_req(&rth, filter.family, ipneigh_dump_filter) < 0) { perror("Cannot send dump request"); exit(1); } diff --git a/ip/iproute.c b/ip/iproute.c index fa6a84b5..0440366e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -83,7 +83,7 @@ static void usage(void) "INFO_SPEC := NH OPTIONS 
FLAGS [ nexthop NH ]...\n" "NH := [ encap ENCAPTYPE ENCAPHDR ] [ via [ FAMILY ] ADDRESS ]\n" " [ dev STRING ] [ weight NUMBER ] NHFLAGS\n" - "FAMILY := [ inet | inet6 | ipx | dnet | mpls | bridge | link ]\n" + "FAMILY := [ inet | inet6 | mpls | bridge | link ]\n" "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n" " [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n" " [ window NUMBER ] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n" @@ -1535,24 +1535,6 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv) return 0; } -static int rtnl_rtcache_request(struct rtnl_handle *rth, int family) -{ - struct { - struct nlmsghdr nlh; - struct rtmsg rtm; - } req = { - .nlh.nlmsg_len = sizeof(req), - .nlh.nlmsg_type = RTM_GETROUTE, - .nlh.nlmsg_flags = NLM_F_ROOT | NLM_F_REQUEST, - .nlh.nlmsg_seq = rth->dump = ++rth->seq, - .rtm.rtm_family = family, - .rtm.rtm_flags = RTM_F_CLONED, - }; - struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; - - return sendto(rth->fd, (void *)&req, sizeof(req), 0, (struct sockaddr *)&nladdr, sizeof(nladdr)); -} - static int iproute_flush_cache(void) { #define ROUTE_FLUSH_PATH "/proc/sys/net/ipv4/route/flush" @@ -1622,7 +1604,7 @@ static int save_route_prep(void) return 0; } -static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) +static int iproute_flush(int family, rtnl_filter_t filter_fn) { time_t start = time(0); char flushb[4096-512]; @@ -1630,12 +1612,12 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) int ret; if (filter.cloned) { - if (do_ipv6 != AF_INET6) { + if (family != AF_INET6) { iproute_flush_cache(); if (show_stats) printf("*** IPv4 routing cache is flushed.\n"); } - if (do_ipv6 == AF_INET) + if (family == AF_INET) return 0; } @@ -1644,7 +1626,7 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) filter.flushe = sizeof(flushb); for (;;) { - if (rtnl_routedump_req(&rth, do_ipv6) < 0) { + if (rtnl_routedump_req(&rth, family, NULL) < 0) { 
perror("Cannot send dump request"); return -2; } @@ -1656,7 +1638,7 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) if (filter.flushed == 0) { if (show_stats) { if (round == 0 && - (!filter.cloned || do_ipv6 == AF_INET6)) + (!filter.cloned || family == AF_INET6)) printf("Nothing to flush.\n"); else printf("*** Flush is complete after %d round%s ***\n", @@ -1684,9 +1666,33 @@ static int iproute_flush(int do_ipv6, rtnl_filter_t filter_fn) } } +static int iproute_dump_filter(struct nlmsghdr *nlh, int reqlen) +{ + struct rtmsg *rtm = NLMSG_DATA(nlh); + int err; + + rtm->rtm_protocol = filter.protocol; + if (filter.cloned) + rtm->rtm_flags |= RTM_F_CLONED; + + if (filter.tb) { + err = addattr32(nlh, reqlen, RTA_TABLE, filter.tb); + if (err) + return err; + } + + if (filter.oif) { + err = addattr32(nlh, reqlen, RTA_OIF, filter.oif); + if (err) + return err; + } + + return 0; +} + static int iproute_list_flush_or_save(int argc, char **argv, int action) { - int do_ipv6 = preferred_family; + int dump_family = preferred_family; char *id = NULL; char *od = NULL; unsigned int mark = 0; @@ -1805,13 +1811,13 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) NEXT_ARG(); family = read_family(*argv); if (family == AF_UNSPEC) - family = do_ipv6; + family = dump_family; else NEXT_ARG(); get_prefix(&filter.rvia, *argv, family); } else if (strcmp(*argv, "src") == 0) { NEXT_ARG(); - get_prefix(&filter.rprefsrc, *argv, do_ipv6); + get_prefix(&filter.rprefsrc, *argv, dump_family); } else if (matches(*argv, "realms") == 0) { __u32 realm; @@ -1831,15 +1837,15 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) NEXT_ARG(); if (matches(*argv, "root") == 0) { NEXT_ARG(); - get_prefix(&filter.rsrc, *argv, do_ipv6); + get_prefix(&filter.rsrc, *argv, dump_family); } else if (matches(*argv, "match") == 0) { NEXT_ARG(); - get_prefix(&filter.msrc, *argv, do_ipv6); + get_prefix(&filter.msrc, *argv, dump_family); } else { if 
(matches(*argv, "exact") == 0) { NEXT_ARG(); } - get_prefix(&filter.msrc, *argv, do_ipv6); + get_prefix(&filter.msrc, *argv, dump_family); filter.rsrc = filter.msrc; } } else { @@ -1848,23 +1854,23 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) } if (matches(*argv, "root") == 0) { NEXT_ARG(); - get_prefix(&filter.rdst, *argv, do_ipv6); + get_prefix(&filter.rdst, *argv, dump_family); } else if (matches(*argv, "match") == 0) { NEXT_ARG(); - get_prefix(&filter.mdst, *argv, do_ipv6); + get_prefix(&filter.mdst, *argv, dump_family); } else { if (matches(*argv, "exact") == 0) { NEXT_ARG(); } - get_prefix(&filter.mdst, *argv, do_ipv6); + get_prefix(&filter.mdst, *argv, dump_family); filter.rdst = filter.mdst; } } argc--; argv++; } - if (do_ipv6 == AF_UNSPEC && filter.tb) - do_ipv6 = AF_INET; + if (dump_family == AF_UNSPEC && filter.tb) + dump_family = AF_INET; if (id || od) { int idx; @@ -1887,18 +1893,11 @@ static int iproute_list_flush_or_save(int argc, char **argv, int action) filter.mark = mark; if (action == IPROUTE_FLUSH) - return iproute_flush(do_ipv6, filter_fn); + return iproute_flush(dump_family, filter_fn); - if (!filter.cloned) { - if (rtnl_routedump_req(&rth, do_ipv6) < 0) { - perror("Cannot send dump request"); - return -2; - } - } else { - if (rtnl_rtcache_request(&rth, do_ipv6) < 0) { - perror("Cannot send dump request"); - return -2; - } + if (rtnl_routedump_req(&rth, dump_family, iproute_dump_filter) < 0) { + perror("Cannot send dump request"); + return -2; } new_json_obj(json); diff --git a/ip/iprule.c b/ip/iprule.c index 9a7173d0..2f58d8c2 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -79,6 +79,9 @@ static struct inet_prefix dst; int protocol; int protocolmask; + struct fib_rule_port_range sport; + struct fib_rule_port_range dport; + __u8 ipproto; } filter; static inline int frh_get_table(struct fib_rule_hdr *frh, struct rtattr **tb) @@ -175,6 +178,39 @@ static bool filter_nlmsg(struct nlmsghdr *n, struct rtattr **tb, int 
host_len) return false; } + if (filter.ipproto) { + __u8 ipproto = 0; + + if (tb[FRA_IP_PROTO]) + ipproto = rta_getattr_u8(tb[FRA_IP_PROTO]); + if (filter.ipproto != ipproto) + return false; + } + + if (filter.sport.start) { + const struct fib_rule_port_range *r; + + if (!tb[FRA_SPORT_RANGE]) + return false; + + r = RTA_DATA(tb[FRA_SPORT_RANGE]); + if (r->start != filter.sport.start || + r->end != filter.sport.end) + return false; + } + + if (filter.dport.start) { + const struct fib_rule_port_range *r; + + if (!tb[FRA_DPORT_RANGE]) + return false; + + r = RTA_DATA(tb[FRA_DPORT_RANGE]); + if (r->start != filter.dport.start || + r->end != filter.dport.end) + return false; + } + if (filter.tun_id) { __u64 tun_id = 0; @@ -633,6 +669,36 @@ static int iprule_list_flush_or_save(int argc, char **argv, int action) filter.protocolmask = 0; } filter.protocol = prot; + } else if (strcmp(*argv, "ipproto") == 0) { + int ipproto; + + NEXT_ARG(); + ipproto = inet_proto_a2n(*argv); + if (ipproto < 0) + invarg("Invalid \"ipproto\" value\n", *argv); + filter.ipproto = ipproto; + } else if (strcmp(*argv, "sport") == 0) { + struct fib_rule_port_range r; + int ret; + + NEXT_ARG(); + ret = sscanf(*argv, "%hu-%hu", &r.start, &r.end); + if (ret == 1) + r.end = r.start; + else if (ret != 2) + invarg("invalid port range\n", *argv); + filter.sport = r; + } else if (strcmp(*argv, "dport") == 0) { + struct fib_rule_port_range r; + int ret; + + NEXT_ARG(); + ret = sscanf(*argv, "%hu-%hu", &r.start, &r.end); + if (ret == 1) + r.end = r.start; + else if (ret != 2) + invarg("invalid dport range\n", *argv); + filter.dport = r; } else{ if (matches(*argv, "dst") == 0 || matches(*argv, "to") == 0) { diff --git a/ip/ipvrf.c b/ip/ipvrf.c index 8a6b7f97..08a0d45b 100644 --- a/ip/ipvrf.c +++ b/ip/ipvrf.c @@ -589,7 +589,7 @@ static int ipvrf_show(int argc, char **argv) return 0; } - if (ip_linkaddr_list(0, ipvrf_filter_req, &linfo, NULL) == 0) { + if (ip_link_list(ipvrf_filter_req, &linfo) == 0) { struct 
nlmsg_list *l; unsigned nvrf = 0; int n; diff --git a/lib/dnet_ntop.c b/lib/dnet_ntop.c deleted file mode 100644 index 17d960e3..00000000 --- a/lib/dnet_ntop.c +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ u_int16_t dn_ntohs(u_int16_t addr) -{ - union { - u_int8_t byte[2]; - u_int16_t word; - } u; - - u.word = addr; - return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); -} - -static __inline__ int do_digit(char *str, u_int16_t *addr, u_int16_t scale, size_t *pos, size_t len, int *started) -{ - u_int16_t tmp = *addr / scale; - - if (*pos == len) - return 1; - - if (((tmp) > 0) || *started || (scale == 1)) { - *str = tmp + '0'; - *started = 1; - (*pos)++; - *addr -= (tmp * scale); - } - - return 0; -} - - -static const char *dnet_ntop1(const struct dn_naddr *dna, char *str, size_t len) -{ - u_int16_t addr, area; - size_t pos = 0; - int started = 0; - - memcpy(&addr, dna->a_addr, sizeof(addr)); - addr = dn_ntohs(addr); - area = addr >> 10; - - if (dna->a_len != 2) - return NULL; - - addr &= 0x03ff; - - if (len == 0) - return str; - - if (do_digit(str + pos, &area, 10, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &area, 1, &pos, len, &started)) - return str; - - if (pos == len) - return str; - - *(str + pos) = '.'; - pos++; - started = 0; - - if (do_digit(str + pos, &addr, 1000, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 100, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 10, &pos, len, &started)) - return str; - - if (do_digit(str + pos, &addr, 1, &pos, len, &started)) - return str; - - if (pos == len) - return str; - - *(str + pos) = 0; - - return str; -} - - -const char *dnet_ntop(int af, const void *addr, char *str, size_t len) -{ - switch(af) { - case AF_DECnet: - errno = 0; - return dnet_ntop1((struct dn_naddr *)addr, str, len); - default: - errno = EAFNOSUPPORT; - } - - 
return NULL; -} diff --git a/lib/dnet_pton.c b/lib/dnet_pton.c deleted file mode 100644 index 1cf54e51..00000000 --- a/lib/dnet_pton.c +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ u_int16_t dn_htons(u_int16_t addr) -{ - union { - u_int8_t byte[2]; - u_int16_t word; - } u; - - u.word = addr; - return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); -} - - -static int dnet_num(const char *src, u_int16_t * dst) -{ - int rv = 0; - int tmp; - *dst = 0; - - while ((tmp = *src++) != 0) { - tmp -= '0'; - if ((tmp < 0) || (tmp > 9)) - return rv; - - rv++; - (*dst) *= 10; - (*dst) += tmp; - } - - return rv; -} - -static int dnet_pton1(const char *src, struct dn_naddr *dna) -{ - u_int16_t addr; - u_int16_t area = 0; - u_int16_t node = 0; - int pos; - - pos = dnet_num(src, &area); - if ((pos == 0) || (area > 63) || (*(src + pos) != '.')) - return 0; - pos = dnet_num(src + pos + 1, &node); - if ((pos == 0) || (node > 1023)) - return 0; - dna->a_len = 2; - addr = dn_htons((area << 10) | node); - memcpy(dna->a_addr, &addr, sizeof(addr)); - - return 1; -} - -int dnet_pton(int af, const char *src, void *addr) -{ - int err; - - switch (af) { - case AF_DECnet: - errno = 0; - err = dnet_pton1(src, (struct dn_naddr *)addr); - break; - default: - errno = EAFNOSUPPORT; - err = -1; - } - - return err; -} diff --git a/lib/ipx_ntop.c b/lib/ipx_ntop.c deleted file mode 100644 index 80b8a34e..00000000 --- a/lib/ipx_ntop.c +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include - -#include "utils.h" - -static __inline__ int do_digit(char *str, u_int32_t addr, u_int32_t scale, size_t *pos, size_t len) -{ - u_int32_t tmp = addr >> (scale * 4); - - if (*pos == len) - return 1; - - tmp &= 0x0f; - if (tmp > 9) - *str = tmp + 'A' - 10; - else - *str = tmp + '0'; - (*pos)++; - - return 0; -} - -static const char *ipx_ntop1(const 
struct ipx_addr *addr, char *str, size_t len) -{ - int i; - size_t pos = 0; - - if (len == 0) - return str; - - for(i = 7; i >= 0; i--) - if (do_digit(str + pos, ntohl(addr->ipx_net), i, &pos, len)) - return str; - - if (pos == len) - return str; - - *(str + pos) = '.'; - pos++; - - for(i = 0; i < 6; i++) { - if (do_digit(str + pos, addr->ipx_node[i], 1, &pos, len)) - return str; - if (do_digit(str + pos, addr->ipx_node[i], 0, &pos, len)) - return str; - } - - if (pos == len) - return str; - - *(str + pos) = 0; - - return str; -} - - -const char *ipx_ntop(int af, const void *addr, char *str, size_t len) -{ - switch(af) { - case AF_IPX: - errno = 0; - return ipx_ntop1((struct ipx_addr *)addr, str, len); - default: - errno = EAFNOSUPPORT; - } - - return NULL; -} diff --git a/lib/ipx_pton.c b/lib/ipx_pton.c deleted file mode 100644 index a97c1c1b..00000000 --- a/lib/ipx_pton.c +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include -#include -#include -#include -#include - -#include "utils.h" - -static int ipx_getnet(u_int32_t *net, const char *str) -{ - int i; - u_int32_t tmp; - - for(i = 0; *str && (i < 8); i++) { - - if ((tmp = get_hex(*str)) == -1) { - if (*str == '.') - return 0; - else - return -1; - } - - str++; - (*net) <<= 4; - (*net) |= tmp; - } - - if (*str == 0) - return 0; - - return -1; -} - -static int ipx_getnode(u_int8_t *node, const char *str) -{ - int i; - u_int32_t tmp; - - for(i = 0; i < 6; i++) { - if ((tmp = get_hex(*str++)) == -1) - return -1; - node[i] = (u_int8_t)tmp; - node[i] <<= 4; - if ((tmp = get_hex(*str++)) == -1) - return -1; - node[i] |= (u_int8_t)tmp; - if (*str == ':') - str++; - } - - return 0; -} - -static int ipx_pton1(const char *src, struct ipx_addr *addr) -{ - char *sep = (char *)src; - int no_node = 0; - - memset(addr, 0, sizeof(struct ipx_addr)); - - while(*sep && (*sep != '.')) - sep++; - - if (*sep != '.') - no_node = 1; - - if (ipx_getnet(&addr->ipx_net, src)) - return 0; - - addr->ipx_net = 
htonl(addr->ipx_net); - - if (no_node) - return 1; - - if (ipx_getnode(addr->ipx_node, sep + 1)) - return 0; - - return 1; -} - -int ipx_pton(int af, const char *src, void *addr) -{ - int err; - - switch (af) { - case AF_IPX: - errno = 0; - err = ipx_pton1(src, (struct ipx_addr *)addr); - break; - default: - errno = EAFNOSUPPORT; - err = -1; - } - - return err; -} diff --git a/lib/json_print.c b/lib/json_print.c index 54fa40cf..4eb2d0dc 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -118,6 +118,7 @@ void close_json_array(enum output_type type, const char *str) } _PRINT_FUNC(int, int); _PRINT_FUNC(s64, int64_t); +_PRINT_FUNC(hhu, unsigned char); _PRINT_FUNC(hu, unsigned short); _PRINT_FUNC(uint, unsigned int); _PRINT_FUNC(u64, uint64_t); diff --git a/lib/json_writer.c b/lib/json_writer.c index 5779ec06..5004c181 100644 --- a/lib/json_writer.c +++ b/lib/json_writer.c @@ -211,6 +211,11 @@ void jsonw_float(json_writer_t *self, double num) jsonw_printf(self, "%g", num); } +void jsonw_hhu(json_writer_t *self, unsigned char num) +{ + jsonw_printf(self, "%hhu", num); +} + void jsonw_hu(json_writer_t *self, unsigned short num) { jsonw_printf(self, "%hu", num); @@ -288,6 +293,12 @@ void jsonw_xint_field(json_writer_t *self, const char *prop, uint64_t num) jsonw_xint(self, num); } +void jsonw_hhu_field(json_writer_t *self, const char *prop, unsigned char num) +{ + jsonw_name(self, prop); + jsonw_hhu(self, num); +} + void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num) { jsonw_name(self, prop); diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 95457109..110f47bc 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -28,6 +28,8 @@ #include "libnetlink.h" +#define __aligned(x) __attribute__((aligned(x))) + #ifndef SOL_NETLINK #define SOL_NETLINK 270 #endif @@ -67,6 +69,14 @@ static int err_attr_cb(const struct nlattr *attr, void *data) return MNL_CB_OK; } +static void print_ext_ack_msg(bool is_err, const char *msg) +{ + fprintf(stderr, 
"%s: %s", is_err ? "Error" : "Warning", msg); + if (msg[strlen(msg) - 1] != '.') + fprintf(stderr, "."); + fprintf(stderr, "\n"); +} + /* dump netlink extended ack error message */ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) { @@ -108,12 +118,29 @@ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) if (msg && *msg != '\0') { bool is_err = !!err->error; - fprintf(stderr, "%s: %s", - is_err ? "Error" : "Warning", msg); - if (msg[strlen(msg) - 1] != '.') - fprintf(stderr, "."); - fprintf(stderr, "\n"); + print_ext_ack_msg(is_err, msg); + return is_err ? 1 : 0; + } + return 0; +} + +static int nl_dump_ext_ack_done(const struct nlmsghdr *nlh, int error) +{ + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1] = {}; + unsigned int hlen = sizeof(int); + const char *msg = NULL; + + if (mnl_attr_parse(nlh, hlen, err_attr_cb, tb) != MNL_CB_OK) + return 0; + + if (tb[NLMSGERR_ATTR_MSG]) + msg = mnl_attr_get_str(tb[NLMSGERR_ATTR_MSG]); + + if (msg && *msg != '\0') { + bool is_err = !!error; + + print_ext_ack_msg(is_err, msg); return is_err ? 
1 : 0; } @@ -127,8 +154,25 @@ int nl_dump_ext_ack(const struct nlmsghdr *nlh, nl_ext_ack_fn_t errfn) { return 0; } + +static int nl_dump_ext_ack_done(const struct nlmsghdr *nlh, int error) +{ + return 0; +} #endif +/* Older kernels may not support strict dump and filtering */ +void rtnl_set_strict_dump(struct rtnl_handle *rth) +{ + int one = 1; + + if (setsockopt(rth->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, + &one, sizeof(one)) < 0) + return; + + rth->flags |= RTNL_HANDLE_F_STRICT_CHK; +} + void rtnl_close(struct rtnl_handle *rth) { if (rth->fd >= 0) { @@ -202,19 +246,29 @@ int rtnl_open(struct rtnl_handle *rth, unsigned int subscriptions) return rtnl_open_byproto(rth, subscriptions, NETLINK_ROUTE); } -int rtnl_addrdump_req(struct rtnl_handle *rth, int family) +int rtnl_addrdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr nlh; struct ifaddrmsg ifm; + char buf[128]; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), .nlh.nlmsg_type = RTM_GETADDR, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, .ifm.ifa_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } @@ -224,7 +278,7 @@ int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ifaddrlblmsg ifal; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrlblmsg)), .nlh.nlmsg_type = RTM_GETADDRLABEL, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -234,19 +288,29 @@ int rtnl_addrlbldump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_routedump_req(struct rtnl_handle *rth, int family) +int rtnl_routedump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr 
nlh; struct rtmsg rtm; + char buf[128]; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), .nlh.nlmsg_type = RTM_GETROUTE, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, .rtm.rtm_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } @@ -256,7 +320,7 @@ int rtnl_ruledump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct fib_rule_hdr frh; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), .nlh.nlmsg_type = RTM_GETRULE, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -266,19 +330,29 @@ int rtnl_ruledump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_neighdump_req(struct rtnl_handle *rth, int family) +int rtnl_neighdump_req(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) { struct { struct nlmsghdr nlh; struct ndmsg ndm; + char buf[256]; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), .nlh.nlmsg_type = RTM_GETNEIGH, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, .ndm.ndm_family = family, }; + if (filter_fn) { + int err; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + } + return send(rth->fd, &req, sizeof(req), 0); } @@ -288,7 +362,7 @@ int rtnl_neightbldump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct ndtmsg ndtmsg; } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndtmsg)), .nlh.nlmsg_type = RTM_GETNEIGHTBL, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -304,7 +378,7 @@ int rtnl_mdbdump_req(struct rtnl_handle *rth, int family) struct nlmsghdr nlh; struct br_port_msg bpm; } 
req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg)), .nlh.nlmsg_type = RTM_GETMDB, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -319,8 +393,9 @@ int rtnl_netconfdump_req(struct rtnl_handle *rth, int family) struct { struct nlmsghdr nlh; struct netconfmsg ncm; + char buf[0] __aligned(NLMSG_ALIGNTO); } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct netconfmsg))), .nlh.nlmsg_type = RTM_GETNETCONF, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -335,8 +410,9 @@ int rtnl_nsiddump_req(struct rtnl_handle *rth, int family) struct { struct nlmsghdr nlh; struct rtgenmsg rtm; + char buf[0] __aligned(NLMSG_ALIGNTO); } req = { - .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_len = NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct rtgenmsg))), .nlh.nlmsg_type = RTM_GETNSID, .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, .nlh.nlmsg_seq = rth->dump = ++rth->seq, @@ -346,41 +422,11 @@ int rtnl_nsiddump_req(struct rtnl_handle *rth, int family) return send(rth->fd, &req, sizeof(req), 0); } -int rtnl_linkdump_req(struct rtnl_handle *rth, int family) -{ - return rtnl_linkdump_req_filter(rth, family, RTEXT_FILTER_VF); -} - -int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, - __u32 filt_mask) +static int __rtnl_linkdump_req(struct rtnl_handle *rth, int family) { struct { struct nlmsghdr nlh; struct ifinfomsg ifm; - /* attribute has to be NLMSG aligned */ - struct rtattr ext_req __attribute__ ((aligned(NLMSG_ALIGNTO))); - __u32 ext_filter_mask; - } req = { - .nlh.nlmsg_len = sizeof(req), - .nlh.nlmsg_type = RTM_GETLINK, - .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, - .nlh.nlmsg_seq = rth->dump = ++rth->seq, - .ifm.ifi_family = family, - .ext_req.rta_type = IFLA_EXT_MASK, - .ext_req.rta_len = RTA_LENGTH(sizeof(__u32)), - .ext_filter_mask = filt_mask, - }; - - return send(rth->fd, &req, sizeof(req), 
0); -} - -int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, - req_filter_fn_t filter_fn) -{ - struct { - struct nlmsghdr nlh; - struct ifinfomsg ifm; - char buf[1024]; } req = { .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nlh.nlmsg_type = RTM_GETLINK, @@ -388,16 +434,73 @@ int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, .nlh.nlmsg_seq = rth->dump = ++rth->seq, .ifm.ifi_family = family, }; - int err; - if (!filter_fn) - return -EINVAL; + return send(rth->fd, &req, sizeof(req), 0); +} - err = filter_fn(&req.nlh, sizeof(req)); - if (err) - return err; +int rtnl_linkdump_req(struct rtnl_handle *rth, int family) +{ + if (family == AF_UNSPEC) + return rtnl_linkdump_req_filter(rth, family, RTEXT_FILTER_VF); - return send(rth->fd, &req, req.nlh.nlmsg_len, 0); + return __rtnl_linkdump_req(rth, family); +} + +int rtnl_linkdump_req_filter(struct rtnl_handle *rth, int family, + __u32 filt_mask) +{ + if (family == AF_UNSPEC || family == AF_BRIDGE) { + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + /* attribute has to be NLMSG aligned */ + struct rtattr ext_req __aligned(NLMSG_ALIGNTO); + __u32 ext_filter_mask; + } req = { + .nlh.nlmsg_len = sizeof(req), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .nlh.nlmsg_seq = rth->dump = ++rth->seq, + .ifm.ifi_family = family, + .ext_req.rta_type = IFLA_EXT_MASK, + .ext_req.rta_len = RTA_LENGTH(sizeof(__u32)), + .ext_filter_mask = filt_mask, + }; + + return send(rth->fd, &req, sizeof(req), 0); + } + + return __rtnl_linkdump_req(rth, family); +} + +int rtnl_linkdump_req_filter_fn(struct rtnl_handle *rth, int family, + req_filter_fn_t filter_fn) +{ + if (family == AF_UNSPEC) { + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + char buf[1024]; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .nlh.nlmsg_seq = rth->dump = 
++rth->seq, + .ifm.ifi_family = family, + }; + int err; + + if (!filter_fn) + return -EINVAL; + + err = filter_fn(&req.nlh, sizeof(req)); + if (err) + return err; + + return send(rth->fd, &req, req.nlh.nlmsg_len, 0); + } + + return __rtnl_linkdump_req(rth, family); } int rtnl_statsdump_req_filter(struct rtnl_handle *rth, int fam, __u32 filt_mask) @@ -512,6 +615,10 @@ static int rtnl_dump_done(struct nlmsghdr *h) } if (len < 0) { + /* check for any messages returned from kernel */ + if (nl_dump_ext_ack_done(h, len)) + return len; + errno = -len; switch (errno) { case ENOENT: diff --git a/lib/utils.c b/lib/utils.c index 84733890..a81c0700 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -600,18 +600,6 @@ static int __get_addr_1(inet_prefix *addr, const char *name, int family) return 0; } - if (family == AF_DECnet) { - struct dn_naddr dna; - - addr->family = AF_DECnet; - if (dnet_pton(AF_DECnet, name, &dna) <= 0) - return -1; - memcpy(addr->data, dna.a_addr, 2); - addr->bytelen = 2; - addr->bitlen = -1; - return 0; - } - if (family == AF_MPLS) { unsigned int maxlabels; int i; @@ -1000,15 +988,6 @@ const char *rt_addr_n2a_r(int af, int len, return inet_ntop(af, addr, buf, buflen); case AF_MPLS: return mpls_ntop(af, addr, buf, buflen); - case AF_IPX: - return ipx_ntop(af, addr, buf, buflen); - case AF_DECnet: - { - struct dn_naddr dna = { 2, { 0, 0, } }; - - memcpy(dna.a_addr, addr, 2); - return dnet_ntop(af, &dna, buf, buflen); - } case AF_PACKET: return ll_addr_n2a(addr, len, ARPHRD_VOID, buf, buflen); case AF_BRIDGE: @@ -1050,8 +1029,6 @@ int read_family(const char *name) family = AF_INET; else if (strcmp(name, "inet6") == 0) family = AF_INET6; - else if (strcmp(name, "dnet") == 0) - family = AF_DECnet; else if (strcmp(name, "link") == 0) family = AF_PACKET; else if (strcmp(name, "ipx") == 0) @@ -1069,8 +1046,6 @@ const char *family_name(int family) return "inet"; if (family == AF_INET6) return "inet6"; - if (family == AF_DECnet) - return "dnet"; if (family == 
AF_PACKET) return "link"; if (family == AF_IPX) diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 5132f514..73d37c19 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -496,6 +496,8 @@ the following additional arguments are supported: ] [ .BI tos " TOS " ] [ +.BI df " DF " +] [ .BI flowlabel " FLOWLABEL " ] [ .BI dstport " PORT " @@ -565,6 +567,18 @@ parameter. .BI tos " TOS" - specifies the TOS value to use in outgoing packets. +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + .sp .BI flowlabel " FLOWLABEL" - specifies the flow label to use in outgoing packets. @@ -1166,6 +1180,8 @@ the following additional arguments are supported: ] [ .BI tos " TOS " ] [ +.BI df " DF " +] [ .BI flowlabel " FLOWLABEL " ] [ .BI dstport " PORT" @@ -1198,6 +1214,18 @@ ttl. Default option is "0". .BI tos " TOS" - specifies the TOS value to use in outgoing packets. +.sp +.BI df " DF" +- specifies the usage of the Don't Fragment flag (DF) bit in outgoing packets +with IPv4 headers. The value +.B inherit +causes the bit to be copied from the original IP header. The values +.B unset +and +.B set +cause the bit to be always unset or always set, respectively. By default, the +bit is not set. + .sp .BI flowlabel " FLOWLABEL" - specifies the flow label to use in outgoing packets. 
diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index 26dfe0b0..9603ac6e 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -107,7 +107,7 @@ replace " } " .ti -8 .IR FAMILY " := [ " -.BR inet " | " inet6 " | " ipx " | " dnet " | " mpls " | " bridge " | " link " ]" +.BR inet " | " inet6 " | " mpls " | " bridge " | " link " ]" .ti -8 .IR OPTIONS " := " FLAGS " [ " diff --git a/man/man8/ip.8 b/man/man8/ip.8 index 1d358879..84ade110 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -34,7 +34,7 @@ ip \- show / manipulate routing, network devices, interfaces and tunnels \fB\-r\fR[\fIesolve\fR] | \fB\-iec\fR | \fB\-f\fR[\fIamily\fR] { -.BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " +.BR inet " | " inet6 " | " link " } | " \fB-4\fR | \fB-6\fR | \fB-I\fR | @@ -94,7 +94,7 @@ Zero (0) means loop until all addresses are removed. .TP .BR "\-f" , " \-family " Specifies the protocol family to use. The protocol family identifier can be one of -.BR "inet" , " inet6" , " bridge" , " ipx" , " dnet" , " mpls" +.BR "inet" , " inet6" , " bridge" , " mpls" or .BR link . If this option is not present, @@ -125,16 +125,6 @@ shortcut for shortcut for .BR "\-family bridge" . -.TP -.B \-D -shortcut for -.BR "\-family decnet" . - -.TP -.B \-I -shortcut for -.BR "\-family ipx" . 
- .TP .B \-M shortcut for diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8 index b7abfe10..069f4717 100644 --- a/man/man8/rdma-dev.8 +++ b/man/man8/rdma-dev.8 @@ -1,6 +1,6 @@ .TH RDMA\-DEV 8 "06 Jul 2017" "iproute2" "Linux" .SH NAME -rdmak-dev \- RDMA device configuration +rdma-dev \- RDMA device configuration .SH SYNOPSIS .sp .ad l @@ -22,10 +22,18 @@ rdmak-dev \- RDMA device configuration .B rdma dev show .RI "[ " DEV " ]" +.ti -8 +.B rdma dev set +.RI "[ " DEV " ]" +.BR name +.BR NEWNAME + .ti -8 .B rdma dev help .SH "DESCRIPTION" +.SS rdma dev set - rename rdma device + .SS rdma dev show - display rdma device attributes .PP @@ -45,6 +53,11 @@ rdma dev show mlx5_3 Shows the state of specified RDMA device. .RE .PP +rdma dev set mlx5_3 name rdma_0 +.RS 4 +Renames the mlx5_3 device to rdma_0. +.RE +.PP .SH SEE ALSO .BR rdma (8), diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index 8be88825..adff41e3 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -56,8 +56,9 @@ flower \- flow based traffic control filter .IR MASKED_IP_TTL " | { " .BR dst_ip " | " src_ip " } " .IR PREFIX " | { " -.BR dst_port " | " src_port " } " -.IR port_number " } | " +.BR dst_port " | " src_port " } { " +.IR port_number " | " +.IR min_port_number-max_port_number " } | " .B tcp_flags .IR MASKED_TCP_FLAGS " | " .B type @@ -220,10 +221,12 @@ must be a valid IPv4 or IPv6 address, depending on the \fBprotocol\fR option to tc filter, optionally followed by a slash and the prefix length. If the prefix is missing, \fBtc\fR assumes a full-length host match. .TP -.BI dst_port " NUMBER" +.IR \fBdst_port " { " NUMBER " | " " MIN_VALUE-MAX_VALUE " } .TQ -.BI src_port " NUMBER" -Match on layer 4 protocol source or destination port number. Only available for +.IR \fBsrc_port " { " NUMBER " | " " MIN_VALUE-MAX_VALUE " } +Match on layer 4 protocol source or destination port number. 
Alternatively, the +minimum and maximum values can be specified to match on a range of layer 4 +protocol source or destination port numbers. Only available for .BR ip_proto " values " udp ", " tcp " and " sctp which have to be specified in beforehand. .TP diff --git a/man/man8/tc-fq.8 b/man/man8/tc-fq.8 index f058a05a..1febe62b 100644 --- a/man/man8/tc-fq.8 +++ b/man/man8/tc-fq.8 @@ -15,23 +15,28 @@ BYTES ] [ .B maxrate RATE ] [ .B buckets -NUMBER ] [ +NUMBER ] [ +.B orphan_mask +NUMBER ] [ .B pacing | .B nopacing -] +] [ +.B ce_threshold +TIME ] .SH DESCRIPTION FQ (Fair Queue) is a classless packet scheduler meant to be mostly used for locally generated traffic. It is designed to achieve per flow pacing. FQ does flow separation, and is able to respect pacing requirements set by TCP stack. All packets belonging to a socket are considered as a 'flow'. -For non local packets (router workload), packet rxhash is used as fallback. +For non local packets (router workload), packet hash is used as fallback. An application can specify a maximum pacing rate using the .B SO_MAX_PACING_RATE setsockopt call. This packet scheduler adds delay between packets to -respect rate limitation set by TCP stack. +respect rate limitation set on each socket. Note that after linux-4.20, linux adopted EDT (Earliest Departure Time) +and TCP directly sets the appropriate Departure Time for each skb. Dequeueing happens in a round-robin fashion. A special FIFO queue is reserved for high priority packets ( @@ -72,18 +77,28 @@ is ignored only if it is larger than this value. The size of the hash table used for flow lookups. Each bucket is assigned a red-black tree for efficient collision sorting. Default: 1024. +.SS orphan_mask +For packets not owned by a socket, fq is able to mask a part of skb->hash +and reduce number of buckets associated with the traffic.
This is a DDOS +prevention mechanism, and the default is 1023 (meaning no more than 1024 flows +are allocated for these packets) .SS [no]pacing Enable or disable flow pacing. Default is enabled. +.SS ce_threshold +sets a threshold above which all packets are marked with ECN Congestion +Experienced. This is useful for DCTCP-style congestion control algorithms that +require marking at very shallow queueing thresholds. + .SH EXAMPLES -#tc qdisc add dev eth0 root fq +#tc qdisc add dev eth0 root est 1sec 4sec fq ce_threshold 4ms .br -#tc -s -d qdisc +#tc -s -d qdisc sh dev eth0 .br -qdisc fq 8003: dev eth0 root refcnt 2 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 - Sent 503727981 bytes 1146972 pkt (dropped 0, overlimits 0 requeues 54452) - backlog 0b 0p requeues 54452 - 1289 flows (1289 inactive, 0 throttled) - 0 gc, 31 highprio, 27411 throttled +qdisc fq 800e: root refcnt 9 limit 10000p flow_limit 1000p buckets 1024 orphan_mask 1023 quantum 3028 initial_quantum 15140 low_rate_threshold 550Kbit refill_delay 40.0ms ce_threshold 4.0ms + Sent 533368436185 bytes 352296695 pkt (dropped 0, overlimits 0 requeues 1339864) + rate 39220Mbit 3238202pps backlog 12417828b 358p requeues 1339864 + 1052 flows (852 inactive, 0 throttled) + 112 gc, 0 highprio, 212 throttled, 21501 ns latency, 470241 ce_mark .br .SH SEE ALSO .BR tc (8), diff --git a/misc/arpd.c b/misc/arpd.c index ce7c0997..504961cb 100644 --- a/misc/arpd.c +++ b/misc/arpd.c @@ -424,7 +424,7 @@ static int do_one_request(struct nlmsghdr *n) static void load_initial_table(void) { - if (rtnl_neighdump_req(&rth, AF_INET) < 0) { + if (rtnl_neighdump_req(&rth, AF_INET, NULL) < 0) { perror("dump request failed"); exit(1); } diff --git a/netem/README.distribution b/netem/README.distribution index 23f7ecb7..6d527854 100644 --- a/netem/README.distribution +++ b/netem/README.distribution @@ -1,4 +1,4 @@ -Notes about distribution tables from Nistnet +Notes about distribution tables from Nistnet 
------------------------------------------------------------------------------- I. About the distribution tables diff --git a/rdma/dev.c b/rdma/dev.c index 7738a6cf..60ff4b31 100644 --- a/rdma/dev.c +++ b/rdma/dev.c @@ -14,6 +14,7 @@ static int dev_help(struct rd *rd) { pr_out("Usage: %s dev show [DEV]\n", rd->filename); + pr_out(" %s dev set [DEV] name DEVNAME\n", rd->filename); return 0; } @@ -258,17 +259,51 @@ static int dev_one_show(struct rd *rd) return rd_exec_cmd(rd, cmds, "parameter"); } +static int dev_set_name(struct rd *rd) +{ + uint32_t seq; + + if (rd_no_arg(rd)) { + pr_err("Please provide device new name.\n"); + return -EINVAL; + } + + rd_prepare_msg(rd, RDMA_NLDEV_CMD_SET, + &seq, (NLM_F_REQUEST | NLM_F_ACK)); + mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx); + mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd)); + + return rd_send_msg(rd); +} + +static int dev_one_set(struct rd *rd) +{ + const struct rd_cmd cmds[] = { + { NULL, dev_help}, + { "name", dev_set_name}, + { 0 } + }; + + return rd_exec_cmd(rd, cmds, "parameter"); +} + static int dev_show(struct rd *rd) { return rd_exec_dev(rd, dev_one_show); } +static int dev_set(struct rd *rd) +{ + return rd_exec_require_dev(rd, dev_one_set); +} + int cmd_dev(struct rd *rd) { const struct rd_cmd cmds[] = { { NULL, dev_show }, { "show", dev_show }, { "list", dev_show }, + { "set", dev_set }, { "help", dev_help }, { 0 } }; diff --git a/rdma/link.c b/rdma/link.c index 7a6d4b7e..c064be62 100644 --- a/rdma/link.c +++ b/rdma/link.c @@ -19,7 +19,7 @@ static int link_help(struct rd *rd) static const char *caps_to_str(uint32_t idx) { -#define RDMA_PORT_FLAGS(x) \ +#define RDMA_PORT_FLAGS_LOW(x) \ x(RESERVED, 0) \ x(SM, 1) \ x(NOTICE, 2) \ @@ -53,13 +53,39 @@ static const char *caps_to_str(uint32_t idx) x(MULT_FDB, 30) \ x(HIERARCHY_INFO, 31) - enum { RDMA_PORT_FLAGS(RDMA_BITMAP_ENUM) }; +#define RDMA_PORT_FLAGS_HIGH(x) \ + x(SET_NODE_DESC, 0) \ + x(EXT_INFO, 1) \ + x(VIRT, 2) \ + 
x(SWITCH_POR_STATE_TABLE, 3) \ + x(LINK_WIDTH_2X, 4) \ + x(LINK_SPEED_HDR, 5) + + /* + * Separation below is needed to allow compilation of rdmatool + * on 32bits systems. On such systems, C-enum is limited to be + * int and can't hold more than 32 bits. + */ + enum { RDMA_PORT_FLAGS_LOW(RDMA_BITMAP_ENUM) }; + enum { RDMA_PORT_FLAGS_HIGH(RDMA_BITMAP_ENUM) }; static const char * const - rdma_port_names[] = { RDMA_PORT_FLAGS(RDMA_BITMAP_NAMES) }; - #undef RDMA_PORT_FLAGS + rdma_port_names_low[] = { RDMA_PORT_FLAGS_LOW(RDMA_BITMAP_NAMES) }; + static const char * const + rdma_port_names_high[] = { RDMA_PORT_FLAGS_HIGH(RDMA_BITMAP_NAMES) }; + uint32_t high_idx; + #undef RDMA_PORT_FLAGS_LOW + #undef RDMA_PORT_FLAGS_HIGH - return rdma_port_names[idx]; + if (idx < ARRAY_SIZE(rdma_port_names_low) && rdma_port_names_low[idx]) + return rdma_port_names_low[idx]; + + high_idx = idx - ARRAY_SIZE(rdma_port_names_low); + if (high_idx < ARRAY_SIZE(rdma_port_names_high) && + rdma_port_names_high[high_idx]) + return rdma_port_names_high[high_idx]; + + return "UNKNOWN"; } static void link_print_caps(struct rd *rd, struct nlattr **tb) diff --git a/rdma/rdma.h b/rdma/rdma.h index 05c3c69b..547bb574 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -74,6 +74,13 @@ struct rd_cmd { int (*func)(struct rd *rd); }; +/* + * Parser interface + */ +bool rd_no_arg(struct rd *rd); +void rd_arg_inc(struct rd *rd); + +char *rd_argv(struct rd *rd); /* * Commands interface @@ -83,11 +90,14 @@ int cmd_link(struct rd *rd); int cmd_res(struct rd *rd); int rd_exec_cmd(struct rd *rd, const struct rd_cmd *c, const char *str); int rd_exec_dev(struct rd *rd, int (*cb)(struct rd *rd)); +int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd)); int rd_exec_link(struct rd *rd, int (*cb)(struct rd *rd), bool strict_port); void rd_free(struct rd *rd); int rd_set_arg_to_devname(struct rd *rd); int rd_argc(struct rd *rd); +int strcmpx(const char *str1, const char *str2); + /* * Device manipulation */ @@ -108,12 
+118,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq); void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t flags); int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data); int rd_attr_cb(const struct nlattr *attr, void *data); +int rd_attr_check(const struct nlattr *attr, int *typep); /* * Print helpers */ void print_driver_table(struct rd *rd, struct nlattr *tb); void newline(struct rd *rd); +void newline_indent(struct rd *rd); #define MAX_LINE_LENGTH 80 #endif /* _RDMA_TOOL_H_ */ diff --git a/rdma/utils.c b/rdma/utils.c index c7023367..069d44fe 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -18,14 +18,14 @@ int rd_argc(struct rd *rd) return rd->argc; } -static char *rd_argv(struct rd *rd) +char *rd_argv(struct rd *rd) { if (!rd_argc(rd)) return NULL; return *rd->argv; } -static int strcmpx(const char *str1, const char *str2) +int strcmpx(const char *str1, const char *str2) { if (strlen(str1) > strlen(str2)) return -1; @@ -39,7 +39,7 @@ static bool rd_argv_match(struct rd *rd, const char *pattern) return strcmpx(rd_argv(rd), pattern) == 0; } -static void rd_arg_inc(struct rd *rd) +void rd_arg_inc(struct rd *rd) { if (!rd_argc(rd)) return; @@ -47,7 +47,7 @@ static void rd_arg_inc(struct rd *rd) rd->argv++; } -static bool rd_no_arg(struct rd *rd) +bool rd_no_arg(struct rd *rd) { return rd_argc(rd) == 0; } @@ -404,7 +404,7 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U64] = MNL_TYPE_U64, }; -static int rd_attr_check(const struct nlattr *attr, int *typep) +int rd_attr_check(const struct nlattr *attr, int *typep) { int type; @@ -577,6 +577,16 @@ out: return ret; } +int rd_exec_require_dev(struct rd *rd, int (*cb)(struct rd *rd)) +{ + if (rd_no_arg(rd)) { + pr_err("Please provide device name.\n"); + return -EINVAL; + } + + return rd_exec_dev(rd, cb); +} + int rd_exec_cmd(struct rd *rd, const struct rd_cmd *cmds, const char *str) { const struct rd_cmd *c; @@ 
-696,7 +706,7 @@ void newline(struct rd *rd) pr_out("\n"); } -static void newline_indent(struct rd *rd) +void newline_indent(struct rd *rd) { newline(rd); if (!rd->json_output) diff --git a/tc/f_flower.c b/tc/f_flower.c index 65fca043..c5636667 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -473,24 +473,57 @@ static int flower_port_attr_type(__u8 ip_proto, enum flower_endpoint endpoint) return -1; } +static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type, + __be16 *min_port_type, + __be16 *max_port_type) +{ + if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP || + ip_proto == IPPROTO_SCTP) { + if (type == FLOWER_ENDPOINT_SRC) { + *min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX; + } else { + *min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN; + *max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX; + } + } else { + return -1; + } + return 0; +} + static int flower_parse_port(char *str, __u8 ip_proto, enum flower_endpoint endpoint, struct nlmsghdr *n) { + __u16 min, max; int ret; - int type; - __be16 port; - type = flower_port_attr_type(ip_proto, endpoint); - if (type < 0) + ret = sscanf(str, "%hu-%hu", &min, &max); + + if (ret == 1) { + int type; + + type = flower_port_attr_type(ip_proto, endpoint); + if (type < 0) + return -1; + addattr16(n, MAX_MSG, type, htons(min)); + } else if (ret == 2) { + __be16 min_port_type, max_port_type; + + if (max <= min) { + fprintf(stderr, "max value should be greater than min value\n"); + return -1; + } + if (flower_port_range_attr_type(ip_proto, endpoint, + &min_port_type, &max_port_type)) + return -1; + + addattr16(n, MAX_MSG, min_port_type, htons(min)); + addattr16(n, MAX_MSG, max_port_type, htons(max)); + } else { return -1; - - ret = get_be16(&port, str, 10); - if (ret) - return -1; - - addattr16(n, MAX_MSG, type, port); - + } return 0; } @@ -1490,6 +1523,29 @@ static void flower_print_port(char *name, struct rtattr *attr) print_hu(PRINT_ANY, name, namefrm, 
rta_getattr_be16(attr)); } +static void flower_print_port_range(char *name, struct rtattr *min_attr, + struct rtattr *max_attr) +{ + if (!min_attr || !max_attr) + return; + + if (is_json_context()) { + open_json_object(name); + print_hu(PRINT_JSON, "start", NULL, rta_getattr_be16(min_attr)); + print_hu(PRINT_JSON, "end", NULL, rta_getattr_be16(max_attr)); + close_json_object(); + } else { + SPRINT_BUF(namefrm); + SPRINT_BUF(out); + size_t done; + + done = sprintf(out, "%u", rta_getattr_be16(min_attr)); + sprintf(out + done, "-%u", rta_getattr_be16(max_attr)); + sprintf(namefrm, "\n %s %%s", name); + print_string(PRINT_ANY, name, namefrm, out); + } +} + static void flower_print_tcp_flags(const char *name, struct rtattr *flags_attr, struct rtattr *mask_attr) { @@ -1678,6 +1734,7 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) { struct rtattr *tb[TCA_FLOWER_MAX + 1]; + __be16 min_port_type, max_port_type; int nl_type, nl_mask_type; __be16 eth_type = 0; __u8 ip_proto = 0xff; @@ -1796,6 +1853,16 @@ static int flower_print_opt(struct filter_util *qu, FILE *f, if (nl_type >= 0) flower_print_port("src_port", tb[nl_type]); + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_DST, + &min_port_type, &max_port_type)) + flower_print_port_range("dst_port", + tb[min_port_type], tb[max_port_type]); + + if (!flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_SRC, + &min_port_type, &max_port_type)) + flower_print_port_range("src_port", + tb[min_port_type], tb[max_port_type]); + flower_print_tcp_flags("tcp_flags", tb[TCA_FLOWER_KEY_TCP_FLAGS], tb[TCA_FLOWER_KEY_TCP_FLAGS_MASK]); diff --git a/tc/q_choke.c b/tc/q_choke.c index b269b133..1353c80c 100644 --- a/tc/q_choke.c +++ b/tc/q_choke.c @@ -188,8 +188,7 @@ static int choke_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) fprintf(f, "limit %up min %up max %up ", qopt->limit, qopt->qth_min, qopt->qth_max); - if (qopt->flags & TC_RED_ECN) - fprintf(f, "ecn "); + 
tc_red_print_flags(qopt->flags); if (show_details) { fprintf(f, "ewma %u ", qopt->Wlog); diff --git a/tc/q_fq.c b/tc/q_fq.c index f3dbf2ba..a4174380 100644 --- a/tc/q_fq.c +++ b/tc/q_fq.c @@ -56,6 +56,7 @@ static void explain(void) fprintf(stderr, " [ [no]pacing ] [ refill_delay TIME ]\n"); fprintf(stderr, " [ low_rate_threshold RATE ]\n"); fprintf(stderr, " [ orphan_mask MASK]\n"); + fprintf(stderr, " [ ce_threshold TIME ]\n"); } static unsigned int ilog2(unsigned int val) @@ -83,6 +84,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, unsigned int defrate; unsigned int refill_delay; unsigned int orphan_mask; + unsigned int ce_threshold; bool set_plimit = false; bool set_flow_plimit = false; bool set_quantum = false; @@ -92,6 +94,7 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, bool set_refill_delay = false; bool set_orphan_mask = false; bool set_low_rate_threshold = false; + bool set_ce_threshold = false; int pacing = -1; struct rtattr *tail; @@ -135,6 +138,13 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, return -1; } set_low_rate_threshold = true; + } else if (strcmp(*argv, "ce_threshold") == 0) { + NEXT_ARG(); + if (get_time(&ce_threshold, *argv)) { + fprintf(stderr, "Illegal \"ce_threshold\"\n"); + return -1; + } + set_ce_threshold = true; } else if (strcmp(*argv, "defrate") == 0) { NEXT_ARG(); if (strchr(*argv, '%')) { @@ -226,6 +236,9 @@ static int fq_parse_opt(struct qdisc_util *qu, int argc, char **argv, if (set_orphan_mask) addattr_l(n, 1024, TCA_FQ_ORPHAN_MASK, &orphan_mask, sizeof(refill_delay)); + if (set_ce_threshold) + addattr_l(n, 1024, TCA_FQ_CE_THRESHOLD, + &ce_threshold, sizeof(ce_threshold)); addattr_nest_end(n, tail); return 0; } @@ -239,6 +252,7 @@ static int fq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) unsigned int rate, quantum; unsigned int refill_delay; unsigned int orphan_mask; + unsigned int ce_threshold; SPRINT_BUF(b1); @@ -310,21 +324,28 @@ 
static int fq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) fprintf(f, "refill_delay %s ", sprint_time(refill_delay, b1)); } + if (tb[TCA_FQ_CE_THRESHOLD] && + RTA_PAYLOAD(tb[TCA_FQ_CE_THRESHOLD]) >= sizeof(__u32)) { + ce_threshold = rta_getattr_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (ce_threshold != ~0U) + fprintf(f, "ce_threshold %s ", sprint_time(ce_threshold, b1)); + } + return 0; } static int fq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) { - struct tc_fq_qd_stats *st; + struct tc_fq_qd_stats *st, _st; if (xstats == NULL) return 0; - if (RTA_PAYLOAD(xstats) < sizeof(*st)) - return -1; + memset(&_st, 0, sizeof(_st)); + memcpy(&_st, RTA_DATA(xstats), min(RTA_PAYLOAD(xstats), sizeof(*st))); - st = RTA_DATA(xstats); + st = &_st; fprintf(f, " %u flows (%u inactive, %u throttled)", st->flows, st->inactive_flows, st->throttled_flows); @@ -343,6 +364,9 @@ static int fq_print_xstats(struct qdisc_util *qu, FILE *f, if (st->unthrottle_latency_ns) fprintf(f, ", %u ns latency", st->unthrottle_latency_ns); + if (st->ce_mark) + fprintf(f, ", %llu ce_mark", st->ce_mark); + if (st->flows_plimit) fprintf(f, ", %llu flows_plimit", st->flows_plimit); diff --git a/tc/q_gred.c b/tc/q_gred.c index e63fac72..e297b866 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -37,10 +37,10 @@ static void explain(void) { fprintf(stderr, "Usage: tc qdisc { add | replace | change } ... gred setup vqs NUMBER\n"); - fprintf(stderr, " default DEFAULT_VQ [ grio ] [ limit BYTES ]\n"); + fprintf(stderr, " default DEFAULT_VQ [ grio ] [ limit BYTES ] [ecn] [harddrop]\n"); fprintf(stderr, " tc qdisc change ... 
gred vq VQ [ prio VALUE ] limit BYTES\n"); fprintf(stderr, " min BYTES max BYTES avpkt BYTES [ burst PACKETS ]\n"); - fprintf(stderr, " [ probability PROBABILITY ] [ bandwidth KBPS ]\n"); + fprintf(stderr, " [ probability PROBABILITY ] [ bandwidth KBPS ] [ecn] [harddrop]\n"); } static int init_gred(struct qdisc_util *qu, int argc, char **argv, @@ -87,6 +87,10 @@ static int init_gred(struct qdisc_util *qu, int argc, char **argv, fprintf(stderr, "Illegal \"limit\"\n"); return -1; } + } else if (strcmp(*argv, "ecn") == 0) { + opt.flags |= TC_RED_ECN; + } else if (strcmp(*argv, "harddrop") == 0) { + opt.flags |= TC_RED_HARDDROP; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -117,15 +121,16 @@ static int init_gred(struct qdisc_util *qu, int argc, char **argv, */ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n, const char *dev) { + struct rtattr *tail, *entry, *vqs; int ok = 0; struct tc_gred_qopt opt = { 0 }; unsigned int burst = 0; unsigned int avpkt = 0; + unsigned int flags = 0; double probability = 0.02; unsigned int rate = 0; int parm; __u8 sbuf[256]; - struct rtattr *tail; __u32 max_P; opt.DP = MAX_DPs; @@ -208,6 +213,10 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n return -1; } ok++; + } else if (strcmp(*argv, "ecn") == 0) { + flags |= TC_RED_ECN; + } else if (strcmp(*argv, "harddrop") == 0) { + flags |= TC_RED_HARDDROP; } else if (strcmp(*argv, "help") == 0) { explain(); return -1; @@ -261,22 +270,167 @@ static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct n addattr_l(n, 1024, TCA_GRED_STAB, sbuf, 256); max_P = probability * pow(2, 32); addattr32(n, 1024, TCA_GRED_MAX_P, max_P); + + vqs = addattr_nest(n, 1024, TCA_GRED_VQ_LIST); + entry = addattr_nest(n, 1024, TCA_GRED_VQ_ENTRY); + addattr32(n, 1024, TCA_GRED_VQ_DP, opt.DP); + addattr32(n, 1024, TCA_GRED_VQ_FLAGS, flags); + addattr_nest_end(n, entry); + addattr_nest_end(n, vqs); + 
addattr_nest_end(n, tail); return 0; } +struct tc_gred_info { + bool flags_present; + __u64 bytes; + __u32 packets; + __u32 backlog; + __u32 prob_drop; + __u32 prob_mark; + __u32 forced_drop; + __u32 forced_mark; + __u32 pdrop; + __u32 other; + __u32 flags; +}; + +static void +gred_parse_vqs(struct tc_gred_info *info, struct rtattr *vqs) +{ + int rem = RTA_PAYLOAD(vqs); + unsigned int offset = 0; + + while (rem > offset) { + struct rtattr *tb_entry[TCA_GRED_VQ_ENTRY_MAX + 1] = {}; + struct rtattr *tb[TCA_GRED_VQ_MAX + 1] = {}; + struct rtattr *entry; + unsigned int len; + unsigned int dp; + + entry = RTA_DATA(vqs) + offset; + + parse_rtattr(tb_entry, TCA_GRED_VQ_ENTRY_MAX, entry, + rem - offset); + len = RTA_LENGTH(RTA_PAYLOAD(entry)); + offset += len; + + if (!tb_entry[TCA_GRED_VQ_ENTRY]) { + fprintf(stderr, + "ERROR: Failed to parse Virtual Queue entry\n"); + continue; + } + + parse_rtattr_nested(tb, TCA_GRED_VQ_MAX, + tb_entry[TCA_GRED_VQ_ENTRY]); + + if (!tb[TCA_GRED_VQ_DP]) { + fprintf(stderr, + "ERROR: Virtual Queue without DP attribute\n"); + continue; + } + + dp = rta_getattr_u32(tb[TCA_GRED_VQ_DP]); + + if (tb[TCA_GRED_VQ_STAT_BYTES]) + info[dp].bytes = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_BYTES]); + if (tb[TCA_GRED_VQ_STAT_PACKETS]) + info[dp].packets = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PACKETS]); + if (tb[TCA_GRED_VQ_STAT_BACKLOG]) + info[dp].backlog = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_BACKLOG]); + if (tb[TCA_GRED_VQ_STAT_PROB_DROP]) + info[dp].prob_drop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PROB_DROP]); + if (tb[TCA_GRED_VQ_STAT_PROB_MARK]) + info[dp].prob_mark = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PROB_MARK]); + if (tb[TCA_GRED_VQ_STAT_FORCED_DROP]) + info[dp].forced_drop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_FORCED_DROP]); + if (tb[TCA_GRED_VQ_STAT_FORCED_MARK]) + info[dp].forced_mark = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_FORCED_MARK]); + if (tb[TCA_GRED_VQ_STAT_PDROP]) + info[dp].pdrop = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_PDROP]); 
+ if (tb[TCA_GRED_VQ_STAT_OTHER]) + info[dp].other = + rta_getattr_u32(tb[TCA_GRED_VQ_STAT_OTHER]); + info[dp].flags_present = !!tb[TCA_GRED_VQ_FLAGS]; + if (tb[TCA_GRED_VQ_FLAGS]) + info[dp].flags = + rta_getattr_u32(tb[TCA_GRED_VQ_FLAGS]); + } +} + +static void +gred_print_stats(struct tc_gred_info *info, struct tc_gred_qopt *qopt) +{ + __u64 bytes = info ? info->bytes : qopt->bytesin; + + SPRINT_BUF(b1); + + if (!is_json_context()) + printf("\n Queue size: "); + + print_uint(PRINT_JSON, "qave", NULL, qopt->qave); + print_string(PRINT_FP, NULL, "average %s ", + sprint_size(qopt->qave, b1)); + + print_uint(PRINT_JSON, "backlog", NULL, qopt->backlog); + print_string(PRINT_FP, NULL, "current %s ", + sprint_size(qopt->backlog, b1)); + + if (!is_json_context()) + printf("\n Dropped packets: "); + + if (info) { + print_uint(PRINT_ANY, "forced_drop", "forced %u ", + info->forced_drop); + print_uint(PRINT_ANY, "prob_drop", "early %u ", + info->prob_drop); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", info->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", info->other); + + if (!is_json_context()) + printf("\n Marked packets: "); + print_uint(PRINT_ANY, "forced_mark", "forced %u ", + info->forced_mark); + print_uint(PRINT_ANY, "prob_mark", "early %u ", + info->prob_mark); + } else { + print_uint(PRINT_ANY, "forced_drop", "forced %u ", + qopt->forced); + print_uint(PRINT_ANY, "prob_drop", "early %u ", qopt->early); + print_uint(PRINT_ANY, "pdrop", "pdrop %u ", qopt->pdrop); + print_uint(PRINT_ANY, "other", "other %u ", qopt->other); + } + + if (!is_json_context()) + printf("\n Total packets: "); + + print_uint(PRINT_ANY, "packets", "%u ", qopt->packets); + + print_uint(PRINT_JSON, "bytes", NULL, bytes); + print_string(PRINT_FP, NULL, "(%s) ", sprint_size(bytes, b1)); +} + static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) { + struct tc_gred_info infos[MAX_DPs] = {}; struct rtattr *tb[TCA_GRED_MAX + 1]; struct tc_gred_sopt *sopt; struct 
tc_gred_qopt *qopt; + bool vq_info = false; __u32 *max_p = NULL; __u32 *limit = NULL; unsigned int i; SPRINT_BUF(b1); - SPRINT_BUF(b2); - SPRINT_BUF(b3); if (opt == NULL) return 0; @@ -302,47 +456,69 @@ static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) return -1; } -/* Bad hack! should really return a proper message as shown above*/ - - fprintf(f, "vqs %u default %u %s", - sopt->DPs, - sopt->def_DP, - sopt->grio ? "grio " : ""); - - if (limit) - fprintf(f, "limit %s ", - sprint_size(*limit, b1)); - - for (i = 0; i < MAX_DPs; i++, qopt++) { - if (qopt->DP >= MAX_DPs) continue; - fprintf(f, "\n vq %u prio %hhu limit %s min %s max %s ", - qopt->DP, - qopt->prio, - sprint_size(qopt->limit, b1), - sprint_size(qopt->qth_min, b2), - sprint_size(qopt->qth_max, b3)); - if (show_details) { - fprintf(f, "ewma %u ", qopt->Wlog); - if (max_p) - fprintf(f, "probability %lg ", max_p[i] / pow(2, 32)); - else - fprintf(f, "Plog %u ", qopt->Plog); - fprintf(f, "Scell_log %u ", qopt->Scell_log); - } - if (show_stats) { - fprintf(f, "\n Queue size: average %s current %s ", - sprint_size(qopt->qave, b1), - sprint_size(qopt->backlog, b2)); - fprintf(f, "\n Dropped packets: forced %u early %u pdrop %u other %u ", - qopt->forced, - qopt->early, - qopt->pdrop, - qopt->other); - fprintf(f, "\n Total packets: %u (%s) ", - qopt->packets, - sprint_size(qopt->bytesin, b1)); - } + if (tb[TCA_GRED_VQ_LIST]) { + gred_parse_vqs(infos, tb[TCA_GRED_VQ_LIST]); + vq_info = true; } + + print_uint(PRINT_ANY, "dp_cnt", "vqs %u ", sopt->DPs); + print_uint(PRINT_ANY, "dp_default", "default %u ", sopt->def_DP); + + if (sopt->grio) + print_bool(PRINT_ANY, "grio", "grio ", true); + else + print_bool(PRINT_ANY, "grio", NULL, false); + + if (limit) { + print_uint(PRINT_JSON, "limit", NULL, *limit); + print_string(PRINT_FP, NULL, "limit %s ", + sprint_size(*limit, b1)); + } + + tc_red_print_flags(sopt->flags); + + open_json_array(PRINT_JSON, "vqs"); + for (i = 0; i < MAX_DPs; i++, 
qopt++) { + if (qopt->DP >= MAX_DPs) + continue; + + open_json_object(NULL); + + print_uint(PRINT_ANY, "vq", "\n vq %u ", qopt->DP); + print_hhu(PRINT_ANY, "prio", "prio %hhu ", qopt->prio); + + print_uint(PRINT_JSON, "limit", NULL, qopt->limit); + print_string(PRINT_FP, NULL, "limit %s ", + sprint_size(qopt->limit, b1)); + + print_uint(PRINT_JSON, "min", NULL, qopt->qth_min); + print_string(PRINT_FP, NULL, "min %s ", + sprint_size(qopt->qth_min, b1)); + + print_uint(PRINT_JSON, "max", NULL, qopt->qth_max); + print_string(PRINT_FP, NULL, "max %s ", + sprint_size(qopt->qth_max, b1)); + + if (infos[i].flags_present) + tc_red_print_flags(infos[i].flags); + + if (show_details) { + print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog); + if (max_p) + print_float(PRINT_ANY, "probability", + "probability %lg ", + max_p[i] / pow(2, 32)); + else + print_uint(PRINT_ANY, "Plog", "Plog %u ", + qopt->Plog); + print_uint(PRINT_ANY, "Scell_log", "Scell_log %u ", + qopt->Scell_log); + } + if (show_stats) + gred_print_stats(vq_info ? 
&infos[i] : NULL, qopt); + close_json_object(); + } + close_json_array(PRINT_JSON, "vqs"); return 0; } diff --git a/tc/q_red.c b/tc/q_red.c index 49fd4ac8..3b3a1204 100644 --- a/tc/q_red.c +++ b/tc/q_red.c @@ -189,18 +189,8 @@ static int red_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_uint(PRINT_JSON, "max", NULL, qopt->qth_max); print_string(PRINT_FP, NULL, "max %s ", sprint_size(qopt->qth_max, b3)); - if (qopt->flags & TC_RED_ECN) - print_bool(PRINT_ANY, "ecn", "ecn ", true); - else - print_bool(PRINT_ANY, "ecn", NULL, false); - if (qopt->flags & TC_RED_HARDDROP) - print_bool(PRINT_ANY, "harddrop", "harddrop ", true); - else - print_bool(PRINT_ANY, "harddrop", NULL, false); - if (qopt->flags & TC_RED_ADAPTATIVE) - print_bool(PRINT_ANY, "adaptive", "adaptive ", true); - else - print_bool(PRINT_ANY, "adaptive", NULL, false); + tc_red_print_flags(qopt->flags); + if (show_details) { print_uint(PRINT_ANY, "ewma", "ewma %u ", qopt->Wlog); if (max_P) diff --git a/tc/q_sfq.c b/tc/q_sfq.c index 6a1d853b..eee31ec5 100644 --- a/tc/q_sfq.c +++ b/tc/q_sfq.c @@ -235,8 +235,7 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) sprint_size(qopt_ext->qth_min, b2), sprint_size(qopt_ext->qth_max, b3), qopt_ext->max_P / pow(2, 32)); - if (qopt_ext->flags & TC_RED_ECN) - fprintf(f, "ecn "); + tc_red_print_flags(qopt_ext->flags); if (show_stats) { fprintf(f, "\n prob_mark %u prob_mark_head %u prob_drop %u", qopt_ext->stats.prob_mark, diff --git a/tc/tc_red.c b/tc/tc_red.c index 178fe088..3ce3ca42 100644 --- a/tc/tc_red.c +++ b/tc/tc_red.c @@ -20,7 +20,9 @@ #include #include +#include "utils.h" #include "tc_core.h" +#include "tc_util.h" #include "tc_red.h" /* @@ -97,3 +99,21 @@ int tc_red_eval_idle_damping(int Wlog, unsigned int avpkt, unsigned int bps, __u sbuf[255] = 31; return clog; } + +void tc_red_print_flags(__u32 flags) +{ + if (flags & TC_RED_ECN) + print_bool(PRINT_ANY, "ecn", "ecn ", true); + else + print_bool(PRINT_ANY, 
"ecn", NULL, false); + + if (flags & TC_RED_HARDDROP) + print_bool(PRINT_ANY, "harddrop", "harddrop ", true); + else + print_bool(PRINT_ANY, "harddrop", NULL, false); + + if (flags & TC_RED_ADAPTATIVE) + print_bool(PRINT_ANY, "adaptive", "adaptive ", true); + else + print_bool(PRINT_ANY, "adaptive", NULL, false); +} diff --git a/tc/tc_red.h b/tc/tc_red.h index 6c6e6b03..3882c831 100644 --- a/tc/tc_red.h +++ b/tc/tc_red.h @@ -6,5 +6,6 @@ int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob); int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt); int tc_red_eval_idle_damping(int wlog, unsigned avpkt, unsigned bandwidth, __u8 *sbuf); +void tc_red_print_flags(__u32 flags); #endif diff --git a/testsuite/tests/tc/cls-testbed.t b/testsuite/tests/tc/cls-testbed.t deleted file mode 100755 index d5c21e5c..00000000 --- a/testsuite/tests/tc/cls-testbed.t +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -# vim: ft=sh - -. lib/generic.sh - -QDISCS="cbq htb dsmark" - -if [ ! -d tests/cls ]; then - ts_log "tests/cls folder does not exist" - ts_skip -fi - -for q in ${QDISCS}; do - ts_log "Preparing classifier testbed with qdisc $q" - - for c in tests/cls/*.c; do - - case "$q" in - cbq) - ts_tc "cls-testbed" "cbq root qdisc creation" \ - qdisc add dev $DEV root handle 10:0 \ - cbq bandwidth 100Mbit avpkt 1400 mpu 64 - ts_tc "cls-testbed" "cbq root class creation" \ - class add dev $DEV parent 10:0 classid 10:12 \ - cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 \ - maxburst 1 avpkt 500 bounded - ;; - htb) - ts_qdisc_available "htb" - if [ $? -eq 0 ]; then - ts_log "cls-testbed: HTB is unsupported by $TC, skipping" - continue; - fi - ts_tc "cls-testbed" "htb root qdisc creation" \ - qdisc add dev $DEV root handle 10:0 htb - ts_tc "cls-testbed" "htb root class creation" \ - class add dev $DEV parent 10:0 classid 10:12 \ - htb rate 100Mbit quantum 1514 - ;; - dsmark) - ts_qdisc_available "dsmark" - if [ $? 
-eq 0 ]; then - ts_log "cls-testbed: dsmark is unsupported by $TC, skipping" - continue; - fi - ts_tc "cls-testbed" "dsmark root qdisc creation" \ - qdisc add dev $DEV root handle 20:0 \ - dsmark indices 64 default_index 1 set_tc_index - ts_tc "cls-testbed" "dsmark class creation" \ - class change dev $DEV parent 20:0 classid 20:12 \ - dsmark mask 0xff value 2 - ts_tc "cls-testbed" "prio inner qdisc creation" \ - qdisc add dev $DEV parent 20:0 handle 10:0 prio - ;; - *) - ts_err "cls-testbed: no testbed configuration found for qdisc $q" - continue - ;; - esac - - ts_tc "cls-testbed" "tree listing" qdisc list dev eth0 - ts_tc "cls-testbed" "tree class listing" class list dev eth0 - ts_log "cls-testbed: starting classifier test $c" - $c - - case "$q" in - *) - ts_tc "cls-testbed" "generic qdisc tree deletion" \ - qdisc del dev $DEV root - ;; - esac - done -done