From 4f59f4a5afbb14ee7556f18b0d0663c2db5c0416 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 15 May 2018 21:49:55 -0300 Subject: [PATCH 1/5] tc-netem: fix limit description in man page As the kernel code says, limit is actually the number of packets it can hold queued at a time, as per: static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { ... if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop_all(skb, sch, to_free); So let's fix the description of the field in the man page. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Stephen Hemminger --- man/man8/tc-netem.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/tc-netem.8 b/man/man8/tc-netem.8 index b31384f5..f2cd86b6 100644 --- a/man/man8/tc-netem.8 +++ b/man/man8/tc-netem.8 @@ -65,7 +65,7 @@ netem has the following options: .SS limit packets -limits the effect of selected options to the indicated number of next packets. +maximum number of packets the qdisc may hold queued at a time. .SS delay adds the chosen delay to the packets outgoing to chosen network interface. The From 405e0c4ffe7a410b09201db42955089fb0033776 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 17 May 2018 16:20:50 -0700 Subject: [PATCH 2/5] tc: allow 0% for percent options Allowing 0% is sometimes useful, for example in netem loss and drop or perhaps dropping all traffic in a HTB bin. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199745 Reported-by: stuartmarsden@gmail.com Fixes: 927e3cfb52b5 ("tc: B.W limits can now be specified in %.") Signed-off-by: Stephen Hemminger --- lib/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/utils.c b/lib/utils.c index 7b2c6dd1..02ce6772 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -105,7 +105,7 @@ int parse_percent(double *val, const char *str) *val = strtod(str, &p) / 100.; if (*val == HUGE_VALF || *val == HUGE_VALL) return 1; - if (*val == 0.0 || (*p && strcmp(p, "%"))) + if (*p && strcmp(p, "%")) return -1; return 0; From e2f5ceccdab52d667873c84a744d8ba2c091fbac Mon Sep 17 00:00:00 2001 From: Pavel Maltsev Date: Fri, 18 May 2018 15:44:00 -0700 Subject: [PATCH 3/5] Allow to configure /var/run/netns directory Currently NETNS_RUN_DIR is hardcoded and refers to /var/run/netns. However, some systems (e.g. Android) don't have /var, which results in errors when attempting to create network namespaces on these systems. This change makes NETNS_RUN_DIR configurable at build time by allowing an environment variable to be passed to the make command. Also, this change makes the /etc/netns directory configurable through the NETNS_ETC_DIR environment variable. 
For example: ./configure && NETNS_RUN_DIR=/mnt/vendor/netns make Tested: verified that iproute2 with configuration mentioned above creates namespaces in /mnt/vendor/netns Signed-off-by: Pavel Maltsev Signed-off-by: Stephen Hemminger --- Makefile | 6 +++++- include/namespace.h | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b526d3b5..651d2a50 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ PREFIX?=/usr LIBDIR?=$(PREFIX)/lib SBINDIR?=/sbin CONFDIR?=/etc/iproute2 +NETNS_RUN_DIR?=/var/run/netns +NETNS_ETC_DIR?=/etc/netns DATADIR?=$(PREFIX)/share HDRDIR?=$(PREFIX)/include/iproute2 DOCDIR?=$(DATADIR)/doc/iproute2 @@ -34,7 +36,9 @@ ifneq ($(SHARED_LIBS),y) DEFINES+= -DNO_SHARED_LIBS endif -DEFINES+=-DCONFDIR=\"$(CONFDIR)\" +DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \ + -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \ + -DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" #options for decnet ADDLIB+=dnet_ntop.o dnet_pton.o diff --git a/include/namespace.h b/include/namespace.h index aed7ce08..e47f9b5d 100644 --- a/include/namespace.h +++ b/include/namespace.h @@ -8,8 +8,13 @@ #include #include +#ifndef NETNS_RUN_DIR #define NETNS_RUN_DIR "/var/run/netns" +#endif + +#ifndef NETNS_ETC_DIR #define NETNS_ETC_DIR "/etc/netns" +#endif #ifndef CLONE_NEWNET #define CLONE_NEWNET 0x40000000 /* New network namespace (lo, device, names sockets, etc) */ From 39d16a02d9637813b188ca65dcfe583f30e41755 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 23 May 2018 11:50:01 -0700 Subject: [PATCH 4/5] ip route: Print expires as signed int rta_expires is a signed int; print it as one. 
Fixes: 663c3cb23103f ("iproute: implement JSON and color output") Signed-off-by: David Ahern Signed-off-by: Stephen Hemminger --- ip/iproute.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index 56dd9f25..cbc43e2b 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -463,8 +463,8 @@ static void print_rta_cacheinfo(FILE *fp, const struct rta_cacheinfo *ci) hz = get_user_hz(); if (ci->rta_expires != 0) - print_uint(PRINT_ANY, "expires", - "expires %usec ", ci->rta_expires/hz); + print_int(PRINT_ANY, "expires", + "expires %dsec ", ci->rta_expires/hz); if (ci->rta_error != 0) print_uint(PRINT_ANY, "error", "error %u ", ci->rta_error); From 65083b5fe38b72032bd9fec4b14ed55f2de92688 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 25 May 2018 07:48:40 -0700 Subject: [PATCH 5/5] ip: defer lookup interface index The ip command would always look up the network device index even when not necessary. This slows down operations like creating lots of VLANs. David reported the original issue; this is an alternative patch that solves it in a slightly more general way. Using iproute2 to create a bridge and add 4094 vlans to it can take from 2 to 3 *minutes*. The reason is the extraneous call to ll_name_to_index. ll_name_to_index results in an ioctl(SIOCGIFINDEX) call which in turn invokes dev_load. If the index does not exist, which it won't when creating a new link, dev_load calls modprobe twice -- once for netdev-NAME and again for NAME. This is unnecessary overhead for each link create. When ip link is invoked for a new device, there is no reason to call ll_name_to_index for the new device. With this patch, creating a bridge and adding 4094 vlans takes less than 3 *seconds*. 
old: # time ip -batch ip-vlan.batch real 3m13.727s user 0m0.076s sys 0m1.959s new: # time ip -batch ip-vlan.batch real 0m3.222s user 0m0.044s sys 0m1.777s Reported-by: David Ahern Signed-off-by: Stephen Hemminger --- ip/ip_common.h | 4 ++-- ip/ipaddress.c | 10 ++++++++-- ip/iplink.c | 28 ++++++++++++++-------------- ip/iplink_xdp.c | 6 ++++-- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/ip/ip_common.h b/ip/ip_common.h index 1b89795c..49eb7d7b 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -36,7 +36,7 @@ int print_addrlabel(const struct sockaddr_nl *who, int print_neigh(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); int ipaddr_list_link(int argc, char **argv); -void ipaddr_get_vf_rate(int, int *, int *, int); +void ipaddr_get_vf_rate(int, int *, int *, const char *); void iplink_usage(void) __attribute__((noreturn)); void iproute_reset_filter(int ifindex); @@ -145,7 +145,7 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp); void lwt_print_encap(FILE *fp, struct rtattr *encap_type, struct rtattr *encap); /* iplink_xdp.c */ -int xdp_parse(int *argc, char ***argv, struct iplink_req *req, __u32 ifindex, +int xdp_parse(int *argc, char ***argv, struct iplink_req *req, const char *ifname, bool generic, bool drv, bool offload); void xdp_dump(FILE *fp, struct rtattr *tb, bool link, bool details); diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 75539e05..00da14c6 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -1967,14 +1967,20 @@ ipaddr_loop_each_vf(struct rtattr *tb[], int vfnum, int *min, int *max) exit(1); } -void ipaddr_get_vf_rate(int vfnum, int *min, int *max, int idx) +void ipaddr_get_vf_rate(int vfnum, int *min, int *max, const char *dev) { struct nlmsg_chain linfo = { NULL, NULL}; struct rtattr *tb[IFLA_MAX+1]; struct ifinfomsg *ifi; struct nlmsg_list *l; struct nlmsghdr *n; - int len; + int idx, len; + + idx = ll_name_to_index(dev); + if (idx == 0) { + fprintf(stderr, "Device %s does 
not exist\n", dev); + exit(1); + } if (rtnl_wilddump_request(&rth, AF_UNSPEC, RTM_GETLINK) < 0) { perror("Cannot send dump request"); diff --git a/ip/iplink.c b/ip/iplink.c index 22afe022..9ff5f692 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -242,9 +242,10 @@ static int iplink_have_newlink(void) } #endif /* ! IPLINK_IOCTL_COMPAT */ -static int nl_get_ll_addr_len(unsigned int dev_index) +static int nl_get_ll_addr_len(const char *ifname) { int len; + int dev_index = ll_name_to_index(ifname); struct iplink_req req = { .n = { .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), @@ -259,6 +260,9 @@ static int nl_get_ll_addr_len(unsigned int dev_index) struct nlmsghdr *answer; struct rtattr *tb[IFLA_MAX+1]; + if (dev_index == 0) + return -1; + if (rtnl_talk(&rth, &req.n, &answer) < 0) return -1; @@ -337,7 +341,7 @@ static void iplink_parse_vf_vlan_info(int vf, int *argcp, char ***argvp, } static int iplink_parse_vf(int vf, int *argcp, char ***argvp, - struct iplink_req *req, int dev_index) + struct iplink_req *req, const char *dev) { char new_rate_api = 0, count = 0, override_legacy_rate = 0; struct ifla_vf_rate tivt; @@ -373,7 +377,7 @@ static int iplink_parse_vf(int vf, int *argcp, char ***argvp, NEXT_ARG(); if (matches(*argv, "mac") == 0) { struct ifla_vf_mac ivm = { 0 }; - int halen = nl_get_ll_addr_len(dev_index); + int halen = nl_get_ll_addr_len(dev); NEXT_ARG(); ivm.vf = vf; @@ -542,7 +546,7 @@ static int iplink_parse_vf(int vf, int *argcp, char ***argvp, int tmin, tmax; if (tivt.min_tx_rate == -1 || tivt.max_tx_rate == -1) { - ipaddr_get_vf_rate(tivt.vf, &tmin, &tmax, dev_index); + ipaddr_get_vf_rate(tivt.vf, &tmin, &tmax, dev); if (tivt.min_tx_rate == -1) tivt.min_tx_rate = tmin; if (tivt.max_tx_rate == -1) @@ -583,7 +587,6 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) int vf = -1; int numtxqueues = -1; int numrxqueues = -1; - int dev_index = 0; int link_netnsid = -1; int index = 0; int group = -1; @@ -605,10 +608,8 @@ int 
iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) if (check_ifname(*argv)) invarg("\"name\" not a valid ifname", *argv); name = *argv; - if (!dev) { + if (!dev) dev = name; - dev_index = ll_name_to_index(dev); - } } else if (strcmp(*argv, "index") == 0) { NEXT_ARG(); if (index) @@ -660,7 +661,7 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) bool offload = strcmp(*argv, "xdpoffload") == 0; NEXT_ARG(); - if (xdp_parse(&argc, &argv, req, dev_index, + if (xdp_parse(&argc, &argv, req, dev, generic, drv, offload)) exit(-1); @@ -750,10 +751,10 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) vflist = addattr_nest(&req->n, sizeof(*req), IFLA_VFINFO_LIST); - if (dev_index == 0) + if (!dev) missarg("dev"); - len = iplink_parse_vf(vf, &argc, &argv, req, dev_index); + len = iplink_parse_vf(vf, &argc, &argv, req, dev); if (len < 0) return -1; addattr_nest_end(&req->n, vflist); @@ -916,7 +917,6 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) if (check_ifname(*argv)) invarg("\"dev\" not a valid ifname", *argv); dev = *argv; - dev_index = ll_name_to_index(dev); } argc--; argv++; } @@ -931,8 +931,8 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req, char **type) else if (!strcmp(name, dev)) name = dev; - if (dev_index && addr_len) { - int halen = nl_get_ll_addr_len(dev_index); + if (dev && addr_len) { + int halen = nl_get_ll_addr_len(dev); if (halen >= 0 && halen != addr_len) { fprintf(stderr, diff --git a/ip/iplink_xdp.c b/ip/iplink_xdp.c index 83826358..dd4fd1fd 100644 --- a/ip/iplink_xdp.c +++ b/ip/iplink_xdp.c @@ -48,8 +48,8 @@ static int xdp_delete(struct xdp_req *xdp) return 0; } -int xdp_parse(int *argc, char ***argv, struct iplink_req *req, __u32 ifindex, - bool generic, bool drv, bool offload) +int xdp_parse(int *argc, char ***argv, struct iplink_req *req, + const char *ifname, bool generic, bool drv, bool offload) { struct bpf_cfg_in cfg = 
{ .type = BPF_PROG_TYPE_XDP, @@ -61,6 +61,8 @@ int xdp_parse(int *argc, char ***argv, struct iplink_req *req, __u32 ifindex, }; if (offload) { + int ifindex = ll_name_to_index(ifname); + if (!ifindex) incomplete_command(); cfg.ifindex = ifindex;