From 043e03a3697e218ddb33e83523b15a1fc477b77b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Jan 2021 17:23:26 +0200 Subject: [PATCH 01/32] nexthop: Fix usage output Before: # ip nexthop help Usage: ip nexthop { list | flush } [ protocol ID ] SELECTOR ip nexthop { add | replace } id ID NH [ protocol ID ] ip nexthop { get| del } id ID SELECTOR := [ id ID ] [ dev DEV ] [ vrf NAME ] [ master DEV ] [ groups ] [ fdb ] NH := { blackhole | [ via ADDRESS ] [ dev DEV ] [ onlink ] [ encap ENCAPTYPE ENCAPHDR ] | group GROUP ] } GROUP := [ id[,weight]>/<id[,weight]>/... ] ENCAPTYPE := [ mpls ] ENCAPHDR := [ MPLSLABEL ] After: # ip nexthop help Usage: ip nexthop { list | flush } [ protocol ID ] SELECTOR ip nexthop { add | replace } id ID NH [ protocol ID ] ip nexthop { get | del } id ID SELECTOR := [ id ID ] [ dev DEV ] [ vrf NAME ] [ master DEV ] [ groups ] [ fdb ] NH := { blackhole | [ via ADDRESS ] [ dev DEV ] [ onlink ] [ encap ENCAPTYPE ENCAPHDR ] | group GROUP [ fdb ] } GROUP := [ <id[,weight]>/<id[,weight]>/... ] ENCAPTYPE := [ mpls ] ENCAPHDR := [ MPLSLABEL ] Signed-off-by: Ido Schimmel Signed-off-by: David Ahern --- ip/ipnexthop.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ip/ipnexthop.c b/ip/ipnexthop.c index b7ffff77..20cde586 100644 --- a/ip/ipnexthop.c +++ b/ip/ipnexthop.c @@ -38,12 +38,12 @@ static void usage(void) fprintf(stderr, "Usage: ip nexthop { list | flush } [ protocol ID ] SELECTOR\n" " ip nexthop { add | replace } id ID NH [ protocol ID ]\n" - " ip nexthop { get| del } id ID\n" + " ip nexthop { get | del } id ID\n" "SELECTOR := [ id ID ] [ dev DEV ] [ vrf NAME ] [ master DEV ]\n" " [ groups ] [ fdb ]\n" "NH := { blackhole | [ via ADDRESS ] [ dev DEV ] [ onlink ]\n" - " [ encap ENCAPTYPE ENCAPHDR ] | group GROUP ] }\n" - "GROUP := [ id[,weight]>/<id[,weight]>/... ]\n" + " [ encap ENCAPTYPE ENCAPHDR ] | group GROUP [ fdb ] }\n" + "GROUP := [ <id[,weight]>/<id[,weight]>/... 
]\n" "ENCAPTYPE := [ mpls ]\n" "ENCAPHDR := [ MPLSLABEL ]\n"); exit(-1); From 9bd498bfcd6b441e66b56bb299fdd62657f4dde4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Jan 2021 17:23:27 +0200 Subject: [PATCH 02/32] ipmonitor: Mention "nexthop" object in help and man page Before: # ip monitor help Usage: ip monitor [ all | LISTofOBJECTS ] [ FILE ] [ label ] [all-nsid] [dev DEVICE] LISTofOBJECTS := link | address | route | mroute | prefix | neigh | netconf | rule | nsid FILE := file FILENAME After: # ip monitor help Usage: ip monitor [ all | LISTofOBJECTS ] [ FILE ] [ label ] [all-nsid] [dev DEVICE] LISTofOBJECTS := link | address | route | mroute | prefix | neigh | netconf | rule | nsid | nexthop FILE := file FILENAME Signed-off-by: Ido Schimmel Signed-off-by: David Ahern --- ip/ipmonitor.c | 2 +- man/man8/ip-monitor.8 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c index 685be52c..99f5fda8 100644 --- a/ip/ipmonitor.c +++ b/ip/ipmonitor.c @@ -32,7 +32,7 @@ static void usage(void) fprintf(stderr, "Usage: ip monitor [ all | LISTofOBJECTS ] [ FILE ] [ label ] [all-nsid] [dev DEVICE]\n" "LISTofOBJECTS := link | address | route | mroute | prefix |\n" - " neigh | netconf | rule | nsid\n" + " neigh | netconf | rule | nsid | nexthop\n" "FILE := file FILENAME\n"); exit(-1); } diff --git a/man/man8/ip-monitor.8 b/man/man8/ip-monitor.8 index 86f8f988..f886d31b 100644 --- a/man/man8/ip-monitor.8 +++ b/man/man8/ip-monitor.8 @@ -55,7 +55,7 @@ command is the first in the command line and then the object list follows: is the list of object types that we want to monitor. It may contain .BR link ", " address ", " route ", " mroute ", " prefix ", " -.BR neigh ", " netconf ", " rule " and " nsid "." +.BR neigh ", " netconf ", " rule ", " nsid " and " nexthop "." 
If no .B file argument is given, From 537995c6d513dedb87c14e428c1034c0a297aa70 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Sun, 3 Jan 2021 08:17:06 +0200 Subject: [PATCH 03/32] rdma: Add support for the netlink extack Add support in rdma for extack errors to be received in userspace when sent from kernel, so now netlink extack error messages sent from kernel would be printed for the user. Signed-off-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Signed-off-by: David Ahern --- rdma/rdma.h | 1 + rdma/utils.c | 24 ++++-------------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/rdma/rdma.h b/rdma/rdma.h index fc8bcf09..470e11c8 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -19,6 +19,7 @@ #include "list.h" #include "utils.h" +#include "mnl_utils.h" #include "json_print.h" #define pr_err(args...) fprintf(stderr, ##args) diff --git a/rdma/utils.c b/rdma/utils.c index 2a201aa4..903a544c 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -666,18 +666,12 @@ int rd_send_msg(struct rd *rd) { int ret; - rd->nl = mnl_socket_open(NETLINK_RDMA); + rd->nl = mnlu_socket_open(NETLINK_RDMA); if (!rd->nl) { pr_err("Failed to open NETLINK_RDMA socket\n"); return -ENODEV; } - ret = mnl_socket_bind(rd->nl, 0, MNL_SOCKET_AUTOPID); - if (ret < 0) { - pr_err("Failed to bind socket with err %d\n", ret); - goto err; - } - ret = mnl_socket_sendto(rd->nl, rd->nlh, rd->nlh->nlmsg_len); if (ret < 0) { pr_err("Failed to send to socket with err %d\n", ret); @@ -692,23 +686,13 @@ err: int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, unsigned int seq) { - int ret; - unsigned int portid; char buf[MNL_SOCKET_BUFFER_SIZE]; + int ret; - portid = mnl_socket_get_portid(rd->nl); - do { - ret = mnl_socket_recvfrom(rd->nl, buf, sizeof(buf)); - if (ret <= 0) - break; - - ret = mnl_cb_run(buf, ret, seq, portid, callback, data); - } while (ret > 0); - + ret = mnlu_socket_recv_run(rd->nl, seq, buf, MNL_SOCKET_BUFFER_SIZE, + callback, data); if (ret < 0 && 
!rd->suppress_errors) perror("error"); - - mnl_socket_close(rd->nl); return ret; } From bf244ee6773486892e3223972afa07f5ab3b66bb Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:35 +0100 Subject: [PATCH 04/32] lib: rt_names: Add rtnl_dsfield_get_name() For formatting DSCP (not full dsfield), it would be handy to be able to just get the name from the name table, and not get any of the remaining cruft related to formatting. Add a new entry point to just fetch the name table string uninterpreted. Use it from rtnl_dsfield_n2a(). Signed-off-by: Petr Machata Signed-off-by: David Ahern --- include/rt_names.h | 1 + lib/rt_names.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/rt_names.h b/include/rt_names.h index 990ed7f2..1835f3be 100644 --- a/include/rt_names.h +++ b/include/rt_names.h @@ -9,6 +9,7 @@ const char *rtnl_rtscope_n2a(int id, char *buf, int len); const char *rtnl_rttable_n2a(__u32 id, char *buf, int len); const char *rtnl_rtrealm_n2a(int id, char *buf, int len); const char *rtnl_dsfield_n2a(int id, char *buf, int len); +const char *rtnl_dsfield_get_name(int id); const char *rtnl_group_n2a(int id, char *buf, int len); int rtnl_rtprot_a2n(__u32 *id, const char *arg); diff --git a/lib/rt_names.c b/lib/rt_names.c index ca0680a1..b976471d 100644 --- a/lib/rt_names.c +++ b/lib/rt_names.c @@ -479,18 +479,30 @@ static void rtnl_rtdsfield_initialize(void) const char *rtnl_dsfield_n2a(int id, char *buf, int len) { + const char *name; + if (id < 0 || id >= 256) { snprintf(buf, len, "%d", id); return buf; } + if (!numeric) { + name = rtnl_dsfield_get_name(id); + if (name != NULL) + return name; + } + snprintf(buf, len, "0x%02x", id); + return buf; +} + +const char *rtnl_dsfield_get_name(int id) +{ + if (id < 0 || id >= 256) + return NULL; if (!rtnl_rtdsfield_tab[id]) { if (!rtnl_rtdsfield_init) rtnl_rtdsfield_initialize(); } - if (!numeric && rtnl_rtdsfield_tab[id]) - return rtnl_rtdsfield_tab[id]; 
- snprintf(buf, len, "0x%02x", id); - return buf; + return rtnl_rtdsfield_tab[id]; } From c13216f7a66ae470c19c450e5e5d1f44bbbb3dba Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:36 +0100 Subject: [PATCH 05/32] lib: Generalize parse_mapping() The function parse_mapping() assumes the key is a number, with a single configurable exception, which is using "all" to mean "all possible keys". If a caller wishes to use symbolic names instead of numbers, they cannot reuse this function. To facilitate reuse in these situations, convert parse_mapping() into a helper, parse_mapping_gen(), which instead of an allow-all boolean takes a generic key-parsing callback. Rewrite parse_mapping() in terms of this newly-added helper and add a pair of key parsers, one for just numbers, another for numbers and the keyword "all". Publish the latter as well. Signed-off-by: Petr Machata Signed-off-by: David Ahern --- include/utils.h | 5 +++++ lib/utils.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/utils.h b/include/utils.h index 17043925..f1403f73 100644 --- a/include/utils.h +++ b/include/utils.h @@ -331,6 +331,11 @@ int parse_one_of(const char *msg, const char *realval, const char * const *list, size_t len, int *p_err); bool parse_on_off(const char *msg, const char *realval, int *p_err); +int parse_mapping_num_all(__u32 *keyp, const char *key); +int parse_mapping_gen(int *argcp, char ***argvp, + int (*key_cb)(__u32 *keyp, const char *key), + int (*mapping_cb)(__u32 key, char *value, void *data), + void *mapping_cb_data); int parse_mapping(int *argcp, char ***argvp, bool allow_all, int (*mapping_cb)(__u32 key, char *value, void *data), void *mapping_cb_data); diff --git a/lib/utils.c b/lib/utils.c index de875639..90e58fa3 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1878,9 +1878,10 @@ bool parse_on_off(const char *msg, const char *realval, int *p_err) return parse_one_of(msg, realval, 
values_on_off, ARRAY_SIZE(values_on_off), p_err); } -int parse_mapping(int *argcp, char ***argvp, bool allow_all, - int (*mapping_cb)(__u32 key, char *value, void *data), - void *mapping_cb_data) +int parse_mapping_gen(int *argcp, char ***argvp, + int (*key_cb)(__u32 *keyp, const char *key), + int (*mapping_cb)(__u32 key, char *value, void *data), + void *mapping_cb_data) { int argc = *argcp; char **argv = *argvp; @@ -1894,9 +1895,7 @@ int parse_mapping(int *argcp, char ***argvp, bool allow_all, break; *colon = '\0'; - if (allow_all && matches(*argv, "all") == 0) { - key = (__u32) -1; - } else if (get_u32(&key, *argv, 0)) { + if (key_cb(&key, *argv)) { ret = 1; break; } @@ -1912,3 +1911,29 @@ int parse_mapping(int *argcp, char ***argvp, bool allow_all, *argvp = argv; return ret; } + +static int parse_mapping_num(__u32 *keyp, const char *key) +{ + return get_u32(keyp, key, 0); +} + +int parse_mapping_num_all(__u32 *keyp, const char *key) +{ + if (matches(key, "all") == 0) { + *keyp = (__u32) -1; + return 0; + } + return parse_mapping_num(keyp, key); +} + +int parse_mapping(int *argcp, char ***argvp, bool allow_all, + int (*mapping_cb)(__u32 key, char *value, void *data), + void *mapping_cb_data) +{ + if (allow_all) + return parse_mapping_gen(argcp, argvp, parse_mapping_num_all, + mapping_cb, mapping_cb_data); + else + return parse_mapping_gen(argcp, argvp, parse_mapping_num, + mapping_cb, mapping_cb_data); +} From 69290c32dca82368ab74f23aaca7866d506c7cde Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:37 +0100 Subject: [PATCH 06/32] dcb: Generalize dcb_set_attribute() The function dcb_set_attribute() takes a fully-formed payload as an argument. For callers that need to build a nested attribute, such as is the case for DCB APP table, this is not great, because with libmnl, they would need to construct a separate netlink message just to pluck out the payload and hand it over to this function. 
Currently, dcb_set_attribute() also always wraps the payload in a DCB_ATTR_IEEE container, because that is what all the dcb subtools so far needed. But that is not appropriate for DCBX in particular, and in fact a handful of other attributes, as well as any CEE payloads. Instead, generalize this code by adding parameters for constructing a custom payload and for fetching the response from a custom response attribute. Then add dcb_set_attribute_va(), which takes a callback to invoke in the right place for the nest to be built, and dcb_set_attribute_bare(), which is similar to dcb_set_attribute(), but does not encapsulate the payload in an IEEE container. Rewrite dcb_set_attribute() compatibly in terms of the new functions. Signed-off-by: Petr Machata Signed-off-by: David Ahern --- dcb/dcb.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++----- dcb/dcb.h | 7 ++++ 2 files changed, 98 insertions(+), 8 deletions(-) diff --git a/dcb/dcb.c b/dcb/dcb.c index 6640deef..9bbbbfa7 100644 --- a/dcb/dcb.c +++ b/dcb/dcb.c @@ -94,12 +94,17 @@ static int dcb_get_attribute_cb(const struct nlmsghdr *nlh, void *data) return mnl_attr_parse(nlh, sizeof(struct dcbmsg), dcb_get_attribute_attr_cb, data); } +struct dcb_set_attribute_response { + int response_attr; +}; + static int dcb_set_attribute_attr_cb(const struct nlattr *attr, void *data) { + struct dcb_set_attribute_response *resp = data; uint16_t len; uint8_t err; - if (mnl_attr_get_type(attr) != DCB_ATTR_IEEE) + if (mnl_attr_get_type(attr) != resp->response_attr) return MNL_CB_OK; len = mnl_attr_get_payload_len(attr); @@ -172,19 +177,23 @@ int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, si return 0; } -int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *data, size_t data_len) +static int __dcb_set_attribute(struct dcb *dcb, int command, const char *dev, + int (*cb)(struct dcb *, struct nlmsghdr *, void *), + void *data, int response_attr) { + struct 
dcb_set_attribute_response resp = { + .response_attr = response_attr, + }; struct nlmsghdr *nlh; - struct nlattr *nest; int ret; - nlh = dcb_prepare(dcb, dev, RTM_SETDCB, DCB_CMD_IEEE_SET); + nlh = dcb_prepare(dcb, dev, RTM_SETDCB, command); - nest = mnl_attr_nest_start(nlh, DCB_ATTR_IEEE); - mnl_attr_put(nlh, attr, data_len, data); - mnl_attr_nest_end(nlh, nest); + ret = cb(dcb, nlh, data); + if (ret) + return ret; - ret = dcb_talk(dcb, nlh, dcb_set_attribute_cb, NULL); + ret = dcb_talk(dcb, nlh, dcb_set_attribute_cb, &resp); if (ret) { perror("Attribute write"); return ret; @@ -192,6 +201,80 @@ int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *da return 0; } +struct dcb_set_attribute_ieee_cb { + int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data); + void *data; +}; + +static int dcb_set_attribute_ieee_cb(struct dcb *dcb, struct nlmsghdr *nlh, void *data) +{ + struct dcb_set_attribute_ieee_cb *ieee_data = data; + struct nlattr *nest; + int ret; + + nest = mnl_attr_nest_start(nlh, DCB_ATTR_IEEE); + ret = ieee_data->cb(dcb, nlh, ieee_data->data); + if (ret) + return ret; + mnl_attr_nest_end(nlh, nest); + + return 0; +} + +int dcb_set_attribute_va(struct dcb *dcb, int command, const char *dev, + int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data), + void *data) +{ + struct dcb_set_attribute_ieee_cb ieee_data = { + .cb = cb, + .data = data, + }; + + return __dcb_set_attribute(dcb, command, dev, + &dcb_set_attribute_ieee_cb, &ieee_data, + DCB_ATTR_IEEE); +} + +struct dcb_set_attribute { + int attr; + const void *data; + size_t data_len; +}; + +static int dcb_set_attribute_put(struct dcb *dcb, struct nlmsghdr *nlh, void *data) +{ + struct dcb_set_attribute *dsa = data; + + mnl_attr_put(nlh, dsa->attr, dsa->data_len, dsa->data); + return 0; +} + +int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *data, size_t data_len) +{ + struct dcb_set_attribute dsa = { + .attr = attr, + .data = data, + .data_len 
= data_len, + }; + + return dcb_set_attribute_va(dcb, DCB_CMD_IEEE_SET, dev, + &dcb_set_attribute_put, &dsa); +} + +int dcb_set_attribute_bare(struct dcb *dcb, int command, const char *dev, + int attr, const void *data, size_t data_len, + int response_attr) +{ + struct dcb_set_attribute dsa = { + .attr = attr, + .data = data, + .data_len = data_len, + }; + + return __dcb_set_attribute(dcb, command, dev, + &dcb_set_attribute_put, &dsa, response_attr); +} + void dcb_print_array_u8(const __u8 *array, size_t size) { SPRINT_BUF(b); diff --git a/dcb/dcb.h b/dcb/dcb.h index 388a4204..da14937c 100644 --- a/dcb/dcb.h +++ b/dcb/dcb.h @@ -2,6 +2,7 @@ #ifndef __DCB_H__ #define __DCB_H__ 1 +#include #include #include @@ -32,6 +33,12 @@ int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, size_t data_len); int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *data, size_t data_len); +int dcb_set_attribute_va(struct dcb *dcb, int command, const char *dev, + int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data), + void *data); +int dcb_set_attribute_bare(struct dcb *dcb, int command, const char *dev, + int attr, const void *data, size_t data_len, + int response_attr); void dcb_print_named_array(const char *json_name, const char *fp_name, const __u8 *array, size_t size, From e59876ff556bc796eee7f6e68984a44e8f2d837b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:38 +0100 Subject: [PATCH 07/32] dcb: Generalize dcb_get_attribute() The function dcb_get_attribute() assumes that the caller knows the exact size of the looked-for payload. It also assumes that the response comes wrapped in a DCB_ATTR_IEEE nest. The former assumption does not hold for the IEEE APP table, which has variable size. The latter one does not hold for DCBX, which is not IEEE-nested, and also for any CEE attributes, which would come CEE-nested. 
Factor out the payload extractor from the current dcb_get_attribute() code, and put into a helper. Then rewrite dcb_get_attribute() compatibly in terms of the new function. Introduce dcb_get_attribute_va() as a thin wrapper for IEEE-nested access, and dcb_get_attribute_bare() for access to attributes that are not nested. Signed-off-by: Petr Machata Signed-off-by: David Ahern --- dcb/dcb.c | 79 ++++++++++++++++++++++++++++++++++++++++++++----------- dcb/dcb.h | 4 +++ 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/dcb/dcb.c b/dcb/dcb.c index 9bbbbfa7..89f9b0ec 100644 --- a/dcb/dcb.c +++ b/dcb/dcb.c @@ -59,25 +59,19 @@ static void dcb_free(struct dcb *dcb) struct dcb_get_attribute { struct dcb *dcb; int attr; - void *data; - size_t data_len; + void *payload; + __u16 payload_len; }; static int dcb_get_attribute_attr_ieee_cb(const struct nlattr *attr, void *data) { struct dcb_get_attribute *ga = data; - uint16_t len; if (mnl_attr_get_type(attr) != ga->attr) return MNL_CB_OK; - len = mnl_attr_get_payload_len(attr); - if (len != ga->data_len) { - fprintf(stderr, "Wrong len %d, expected %zd\n", len, ga->data_len); - return MNL_CB_ERROR; - } - - memcpy(ga->data, mnl_attr_get_payload(attr), ga->data_len); + ga->payload = mnl_attr_get_payload(attr); + ga->payload_len = mnl_attr_get_payload_len(attr); return MNL_CB_STOP; } @@ -94,6 +88,16 @@ static int dcb_get_attribute_cb(const struct nlmsghdr *nlh, void *data) return mnl_attr_parse(nlh, sizeof(struct dcbmsg), dcb_get_attribute_attr_cb, data); } +static int dcb_get_attribute_bare_cb(const struct nlmsghdr *nlh, void *data) +{ + /* Bare attributes (e.g. DCB_ATTR_DCBX) are not wrapped inside an IEEE + * container, so this does not have to go through unpacking in + * dcb_get_attribute_attr_cb(). 
+ */ + return mnl_attr_parse(nlh, sizeof(struct dcbmsg), + dcb_get_attribute_attr_ieee_cb, data); +} + struct dcb_set_attribute_response { int response_attr; }; @@ -155,25 +159,70 @@ static struct nlmsghdr *dcb_prepare(struct dcb *dcb, const char *dev, return nlh; } -int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, size_t data_len) +static int __dcb_get_attribute(struct dcb *dcb, int command, + const char *dev, int attr, + void **payload_p, __u16 *payload_len_p, + int (*get_attribute_cb)(const struct nlmsghdr *nlh, + void *data)) { struct dcb_get_attribute ga; struct nlmsghdr *nlh; int ret; - nlh = dcb_prepare(dcb, dev, RTM_GETDCB, DCB_CMD_IEEE_GET); + nlh = dcb_prepare(dcb, dev, RTM_GETDCB, command); ga = (struct dcb_get_attribute) { .dcb = dcb, .attr = attr, - .data = data, - .data_len = data_len, + .payload = NULL, }; - ret = dcb_talk(dcb, nlh, dcb_get_attribute_cb, &ga); + ret = dcb_talk(dcb, nlh, get_attribute_cb, &ga); if (ret) { perror("Attribute read"); return ret; } + if (ga.payload == NULL) { + perror("Attribute not found"); + return -ENOENT; + } + + *payload_p = ga.payload; + *payload_len_p = ga.payload_len; + return 0; +} + +int dcb_get_attribute_va(struct dcb *dcb, const char *dev, int attr, + void **payload_p, __u16 *payload_len_p) +{ + return __dcb_get_attribute(dcb, DCB_CMD_IEEE_GET, dev, attr, + payload_p, payload_len_p, + dcb_get_attribute_cb); +} + +int dcb_get_attribute_bare(struct dcb *dcb, int cmd, const char *dev, int attr, + void **payload_p, __u16 *payload_len_p) +{ + return __dcb_get_attribute(dcb, cmd, dev, attr, + payload_p, payload_len_p, + dcb_get_attribute_bare_cb); +} + +int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, size_t data_len) +{ + __u16 payload_len; + void *payload; + int ret; + + ret = dcb_get_attribute_va(dcb, dev, attr, &payload, &payload_len); + if (ret) + return ret; + + if (payload_len != data_len) { + fprintf(stderr, "Wrong len %d, expected %zd\n", 
payload_len, data_len); + return -EINVAL; + } + + memcpy(data, payload, data_len); return 0; } diff --git a/dcb/dcb.h b/dcb/dcb.h index da14937c..8c7327a4 100644 --- a/dcb/dcb.h +++ b/dcb/dcb.h @@ -33,9 +33,13 @@ int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, size_t data_len); int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *data, size_t data_len); +int dcb_get_attribute_va(struct dcb *dcb, const char *dev, int attr, + void **payload_p, __u16 *payload_len_p); int dcb_set_attribute_va(struct dcb *dcb, int command, const char *dev, int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data), void *data); +int dcb_get_attribute_bare(struct dcb *dcb, int cmd, const char *dev, int attr, + void **payload_p, __u16 *payload_len_p); int dcb_set_attribute_bare(struct dcb *dcb, int command, const char *dev, int attr, const void *data, size_t data_len, int response_attr); From 0aebd32b82aaca867c432eb6e7f6039ea3bd766d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:39 +0100 Subject: [PATCH 08/32] dcb: Support -N to suppress translation to human-readable names Some DSCP values can be translated to symbolic names. That may not be always desirable. Introduce a command-line option similar to other tools, -N or --Numeric, to suppress this translation. 
Signed-off-by: Petr Machata Signed-off-by: David Ahern --- dcb/dcb.c | 9 +++++++-- dcb/dcb.h | 1 + man/man8/dcb.8 | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/dcb/dcb.c b/dcb/dcb.c index 89f9b0ec..cc5103da 100644 --- a/dcb/dcb.c +++ b/dcb/dcb.c @@ -467,7 +467,8 @@ static void dcb_help(void) " dcb [ -f | --force ] { -b | --batch } filename [ -n | --netns ] netnsname\n" "where OBJECT := { buffer | ets | maxrate | pfc }\n" " OPTIONS := [ -V | --Version | -i | --iec | -j | --json\n" - " | -p | --pretty | -s | --statistics | -v | --verbose]\n"); + " | -N | --Numeric | -p | --pretty\n" + " | -s | --statistics | -v | --verbose]\n"); } static int dcb_cmd(struct dcb *dcb, int argc, char **argv) @@ -509,6 +510,7 @@ int main(int argc, char **argv) { "batch", required_argument, NULL, 'b' }, { "iec", no_argument, NULL, 'i' }, { "json", no_argument, NULL, 'j' }, + { "Numeric", no_argument, NULL, 'N' }, { "pretty", no_argument, NULL, 'p' }, { "statistics", no_argument, NULL, 's' }, { "netns", required_argument, NULL, 'n' }, @@ -528,7 +530,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - while ((opt = getopt_long(argc, argv, "b:fhijn:psvV", + while ((opt = getopt_long(argc, argv, "b:fhijn:psvNV", long_options, NULL)) >= 0) { switch (opt) { @@ -545,6 +547,9 @@ int main(int argc, char **argv) case 'j': dcb->json_output = true; break; + case 'N': + dcb->numeric = true; + break; case 'p': pretty = true; break; diff --git a/dcb/dcb.h b/dcb/dcb.h index 8c7327a4..37657c59 100644 --- a/dcb/dcb.h +++ b/dcb/dcb.h @@ -14,6 +14,7 @@ struct dcb { bool json_output; bool stats; bool use_iec; + bool numeric; }; int dcb_parse_mapping(const char *what_key, __u32 key, __u32 max_key, diff --git a/man/man8/dcb.8 b/man/man8/dcb.8 index 7293bb30..1e161eb3 100644 --- a/man/man8/dcb.8 +++ b/man/man8/dcb.8 @@ -59,6 +59,11 @@ the 1000-based ones (K, M, B). .BR "\-j" , " --json" Generate JSON output. 
+.TP +.BR "\-N" , " --Numeric" +If the subtool in question translates numbers to symbolic names in some way, +suppress this translation. + .TP .BR "\-p" , " --pretty" When combined with -j generate a pretty JSON output. From 8e9bed1493f5649232b063a1c12331cf8f517491 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:40 +0100 Subject: [PATCH 09/32] dcb: Add a subtool for the DCB APP object DCB APP interfaces are standardized in 802.1q-2018, and allow configuration of traffic prioritization rules based on several possible headers. Add a dcb subtool for maintenance and display of the APP table. For example: # dcb app add dev eni1np1 dscp-prio 0:0 CS3:3 CS6:6 # dcb app show dev eni1np1 dscp-prio 0:0 CS3:3 CS6:6 # dcb app add dev eni1np1 dscp-prio CS3:4 # dcb app show dev eni1np1 dscp-prio 0:0 CS3:3 CS3:4 CS6:6 # dcb app replace dev eni1np1 dscp-prio CS3:5 # dcb app show dev eni1np1 dscp-prio 0:0 CS3:5 CS6:6 Signed-off-by: Petr Machata Signed-off-by: David Ahern --- dcb/Makefile | 7 +- dcb/dcb.c | 4 +- dcb/dcb.h | 4 + dcb/dcb_app.c | 796 +++++++++++++++++++++++++++++++++++++++++++++ man/man8/dcb-app.8 | 237 ++++++++++++++ man/man8/dcb.8 | 7 +- 6 files changed, 1052 insertions(+), 3 deletions(-) create mode 100644 dcb/dcb_app.c create mode 100644 man/man8/dcb-app.8 diff --git a/dcb/Makefile b/dcb/Makefile index 4add954b..13d45f2b 100644 --- a/dcb/Makefile +++ b/dcb/Makefile @@ -5,7 +5,12 @@ TARGETS := ifeq ($(HAVE_MNL),y) -DCBOBJ = dcb.o dcb_buffer.o dcb_ets.o dcb_maxrate.o dcb_pfc.o +DCBOBJ = dcb.o \ + dcb_app.o \ + dcb_buffer.o \ + dcb_ets.o \ + dcb_maxrate.o \ + dcb_pfc.o TARGETS += dcb endif diff --git a/dcb/dcb.c b/dcb/dcb.c index cc5103da..644059c4 100644 --- a/dcb/dcb.c +++ b/dcb/dcb.c @@ -465,7 +465,7 @@ static void dcb_help(void) fprintf(stderr, "Usage: dcb [ OPTIONS ] OBJECT { COMMAND | help }\n" " dcb [ -f | --force ] { -b | --batch } filename [ -n | --netns ] netnsname\n" - "where OBJECT := { buffer | ets | maxrate | pfc }\n" + "where 
OBJECT := { app | buffer | ets | maxrate | pfc }\n" " OPTIONS := [ -V | --Version | -i | --iec | -j | --json\n" " | -N | --Numeric | -p | --pretty\n" " | -s | --statistics | -v | --verbose]\n"); @@ -476,6 +476,8 @@ static int dcb_cmd(struct dcb *dcb, int argc, char **argv) if (!argc || matches(*argv, "help") == 0) { dcb_help(); return 0; + } else if (matches(*argv, "app") == 0) { + return dcb_cmd_app(dcb, argc - 1, argv + 1); } else if (matches(*argv, "buffer") == 0) { return dcb_cmd_buffer(dcb, argc - 1, argv + 1); } else if (matches(*argv, "ets") == 0) { diff --git a/dcb/dcb.h b/dcb/dcb.h index 37657c59..92eada16 100644 --- a/dcb/dcb.h +++ b/dcb/dcb.h @@ -54,6 +54,10 @@ void dcb_print_array_on_off(const __u8 *array, size_t size); void dcb_print_array_kw(const __u8 *array, size_t array_size, const char *const kw[], size_t kw_size); +/* dcb_app.c */ + +int dcb_cmd_app(struct dcb *dcb, int argc, char **argv); + /* dcb_buffer.c */ int dcb_cmd_buffer(struct dcb *dcb, int argc, char **argv); diff --git a/dcb/dcb_app.c b/dcb/dcb_app.c new file mode 100644 index 00000000..7ce80f85 --- /dev/null +++ b/dcb/dcb_app.c @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "dcb.h" +#include "utils.h" +#include "rt_names.h" + +static void dcb_app_help_add(void) +{ + fprintf(stderr, + "Usage: dcb app { add | del | replace } dev STRING\n" + " [ default-prio PRIO ]\n" + " [ ethtype-prio ET:PRIO ]\n" + " [ stream-port-prio PORT:PRIO ]\n" + " [ dgram-port-prio PORT:PRIO ]\n" + " [ port-prio PORT:PRIO ]\n" + " [ dscp-prio INTEGER:PRIO ]\n" + "\n" + " where PRIO := { 0 .. 7 }\n" + " ET := { 0x600 .. 0xffff }\n" + " PORT := { 1 .. 65535 }\n" + " DSCP := { 0 .. 
63 }\n" + "\n" + ); +} + +static void dcb_app_help_show_flush(void) +{ + fprintf(stderr, + "Usage: dcb app { show | flush } dev STRING\n" + " [ default-prio ]\n" + " [ ethtype-prio ]\n" + " [ stream-port-prio ]\n" + " [ dgram-port-prio ]\n" + " [ port-prio ]\n" + " [ dscp-prio ]\n" + "\n" + ); +} + +static void dcb_app_help(void) +{ + fprintf(stderr, + "Usage: dcb app help\n" + "\n" + ); + dcb_app_help_show_flush(); + dcb_app_help_add(); +} + +struct dcb_app_table { + struct dcb_app *apps; + size_t n_apps; +}; + +static void dcb_app_table_fini(struct dcb_app_table *tab) +{ + free(tab->apps); +} + +static int dcb_app_table_push(struct dcb_app_table *tab, struct dcb_app *app) +{ + struct dcb_app *apps = reallocarray(tab->apps, tab->n_apps + 1, + sizeof(*tab->apps)); + + if (apps == NULL) { + perror("Cannot allocate APP table"); + return -ENOMEM; + } + + tab->apps = apps; + tab->apps[tab->n_apps++] = *app; + return 0; +} + +static void dcb_app_table_remove_existing(struct dcb_app_table *a, + const struct dcb_app_table *b) +{ + size_t ia, ja; + size_t ib; + + for (ia = 0, ja = 0; ia < a->n_apps; ia++) { + struct dcb_app *aa = &a->apps[ia]; + bool found = false; + + for (ib = 0; ib < b->n_apps; ib++) { + const struct dcb_app *ab = &b->apps[ib]; + + if (aa->selector == ab->selector && + aa->protocol == ab->protocol && + aa->priority == ab->priority) { + found = true; + break; + } + } + + if (!found) + a->apps[ja++] = *aa; + } + + a->n_apps = ja; +} + +static void dcb_app_table_remove_replaced(struct dcb_app_table *a, + const struct dcb_app_table *b) +{ + size_t ia, ja; + size_t ib; + + for (ia = 0, ja = 0; ia < a->n_apps; ia++) { + struct dcb_app *aa = &a->apps[ia]; + bool present = false; + bool found = false; + + for (ib = 0; ib < b->n_apps; ib++) { + const struct dcb_app *ab = &b->apps[ib]; + + if (aa->selector == ab->selector && + aa->protocol == ab->protocol) + present = true; + else + continue; + + if (aa->priority == ab->priority) { + found = true; + break; + } + 
} + + /* Entries that remain in A will be removed, so keep in the + * table only APP entries whose sel/pid is mentioned in B, + * but that do not have the full sel/pid/prio match. + */ + if (present && !found) + a->apps[ja++] = *aa; + } + + a->n_apps = ja; +} + +static int dcb_app_table_copy(struct dcb_app_table *a, + const struct dcb_app_table *b) +{ + size_t i; + int ret; + + for (i = 0; i < b->n_apps; i++) { + ret = dcb_app_table_push(a, &b->apps[i]); + if (ret != 0) + return ret; + } + return 0; +} + +static int dcb_app_cmp(const struct dcb_app *a, const struct dcb_app *b) +{ + if (a->protocol < b->protocol) + return -1; + if (a->protocol > b->protocol) + return 1; + return a->priority - b->priority; +} + +static int dcb_app_cmp_cb(const void *a, const void *b) +{ + return dcb_app_cmp(a, b); +} + +static void dcb_app_table_sort(struct dcb_app_table *tab) +{ + qsort(tab->apps, tab->n_apps, sizeof(*tab->apps), dcb_app_cmp_cb); +} + +struct dcb_app_parse_mapping { + __u8 selector; + struct dcb_app_table *tab; + int err; +}; + +static void dcb_app_parse_mapping_cb(__u32 key, __u64 value, void *data) +{ + struct dcb_app_parse_mapping *pm = data; + struct dcb_app app = { + .selector = pm->selector, + .priority = value, + .protocol = key, + }; + + if (pm->err) + return; + + pm->err = dcb_app_table_push(pm->tab, &app); +} + +static int dcb_app_parse_mapping_ethtype_prio(__u32 key, char *value, void *data) +{ + __u8 prio; + + if (key < 0x600) { + fprintf(stderr, "Protocol IDs < 0x600 are reserved for EtherType\n"); + return -EINVAL; + } + + if (get_u8(&prio, value, 0)) + return -EINVAL; + + return dcb_parse_mapping("ETHTYPE", key, 0xffff, + "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1, + dcb_app_parse_mapping_cb, data); +} + +static int dcb_app_parse_dscp(__u32 *key, const char *arg) +{ + if (parse_mapping_num_all(key, arg) == 0) + return 0; + + if (rtnl_dsfield_a2n(key, arg) != 0) + return -1; + + if (*key & 0x03) { + fprintf(stderr, "The values `%s' uses non-DSCP bits.\n", 
arg); + return -1; + } + + /* Unshift the value to convert it from dsfield to DSCP. */ + *key >>= 2; + return 0; +} + +static int dcb_app_parse_mapping_dscp_prio(__u32 key, char *value, void *data) +{ + __u8 prio; + + if (get_u8(&prio, value, 0)) + return -EINVAL; + + return dcb_parse_mapping("DSCP", key, 63, + "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1, + dcb_app_parse_mapping_cb, data); +} + +static int dcb_app_parse_mapping_port_prio(__u32 key, char *value, void *data) +{ + __u8 prio; + + if (key == 0) { + fprintf(stderr, "Port ID of 0 is invalid\n"); + return -EINVAL; + } + + if (get_u8(&prio, value, 0)) + return -EINVAL; + + return dcb_parse_mapping("PORT", key, 0xffff, + "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1, + dcb_app_parse_mapping_cb, data); +} + +static int dcb_app_parse_default_prio(int *argcp, char ***argvp, struct dcb_app_table *tab) +{ + int argc = *argcp; + char **argv = *argvp; + int ret = 0; + + while (argc > 0) { + struct dcb_app app; + __u8 prio; + + if (get_u8(&prio, *argv, 0)) { + ret = 1; + break; + } + + app = (struct dcb_app){ + .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, + .protocol = 0, + .priority = prio, + }; + ret = dcb_app_table_push(tab, &app); + if (ret != 0) + break; + + argc--, argv++; + } + + *argcp = argc; + *argvp = argv; + return ret; +} + +static bool dcb_app_is_ethtype(const struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->protocol != 0; +} + +static bool dcb_app_is_default(const struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + app->protocol == 0; +} + +static bool dcb_app_is_dscp(const struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_DSCP; +} + +static bool dcb_app_is_stream_port(const struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_STREAM; +} + +static bool dcb_app_is_dgram_port(const struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_DGRAM; +} + +static bool dcb_app_is_port(const 
struct dcb_app *app) +{ + return app->selector == IEEE_8021QAZ_APP_SEL_ANY; +} + +static int dcb_app_print_key_dec(__u16 protocol) +{ + return print_uint(PRINT_ANY, NULL, "%d:", protocol); +} + +static int dcb_app_print_key_hex(__u16 protocol) +{ + return print_uint(PRINT_ANY, NULL, "%x:", protocol); +} + +static int dcb_app_print_key_dscp(__u16 protocol) +{ + const char *name = rtnl_dsfield_get_name(protocol << 2); + + + if (!is_json_context() && name != NULL) + return print_string(PRINT_FP, NULL, "%s:", name); + return print_uint(PRINT_ANY, NULL, "%d:", protocol); +} + +static void dcb_app_print_filtered(const struct dcb_app_table *tab, + bool (*filter)(const struct dcb_app *), + int (*print_key)(__u16 protocol), + const char *json_name, + const char *fp_name) +{ + bool first = true; + size_t i; + + for (i = 0; i < tab->n_apps; i++) { + struct dcb_app *app = &tab->apps[i]; + + if (!filter(app)) + continue; + if (first) { + open_json_array(PRINT_JSON, json_name); + print_string(PRINT_FP, NULL, "%s ", fp_name); + first = false; + } + + open_json_array(PRINT_JSON, NULL); + print_key(app->protocol); + print_uint(PRINT_ANY, NULL, "%d ", app->priority); + close_json_array(PRINT_JSON, NULL); + } + + if (!first) { + close_json_array(PRINT_JSON, json_name); + print_nl(); + } +} + +static void dcb_app_print_ethtype_prio(const struct dcb_app_table *tab) +{ + dcb_app_print_filtered(tab, dcb_app_is_ethtype, dcb_app_print_key_hex, + "ethtype_prio", "ethtype-prio"); +} + +static void dcb_app_print_dscp_prio(const struct dcb *dcb, + const struct dcb_app_table *tab) +{ + dcb_app_print_filtered(tab, dcb_app_is_dscp, + dcb->numeric ? 
dcb_app_print_key_dec + : dcb_app_print_key_dscp, + "dscp_prio", "dscp-prio"); +} + +static void dcb_app_print_stream_port_prio(const struct dcb_app_table *tab) +{ + dcb_app_print_filtered(tab, dcb_app_is_stream_port, dcb_app_print_key_dec, + "stream_port_prio", "stream-port-prio"); +} + +static void dcb_app_print_dgram_port_prio(const struct dcb_app_table *tab) +{ + dcb_app_print_filtered(tab, dcb_app_is_dgram_port, dcb_app_print_key_dec, + "dgram_port_prio", "dgram-port-prio"); +} + +static void dcb_app_print_port_prio(const struct dcb_app_table *tab) +{ + dcb_app_print_filtered(tab, dcb_app_is_port, dcb_app_print_key_dec, + "port_prio", "port-prio"); +} + +static void dcb_app_print_default_prio(const struct dcb_app_table *tab) +{ + bool first = true; + size_t i; + + for (i = 0; i < tab->n_apps; i++) { + if (!dcb_app_is_default(&tab->apps[i])) + continue; + if (first) { + open_json_array(PRINT_JSON, "default_prio"); + print_string(PRINT_FP, NULL, "default-prio ", NULL); + first = false; + } + print_uint(PRINT_ANY, NULL, "%d ", tab->apps[i].priority); + } + + if (!first) { + close_json_array(PRINT_JSON, "default_prio"); + print_nl(); + } +} + +static void dcb_app_print(const struct dcb *dcb, const struct dcb_app_table *tab) +{ + dcb_app_print_ethtype_prio(tab); + dcb_app_print_default_prio(tab); + dcb_app_print_dscp_prio(dcb, tab); + dcb_app_print_stream_port_prio(tab); + dcb_app_print_dgram_port_prio(tab); + dcb_app_print_port_prio(tab); +} + +static int dcb_app_get_table_attr_cb(const struct nlattr *attr, void *data) +{ + struct dcb_app_table *tab = data; + struct dcb_app *app; + int ret; + + if (mnl_attr_get_type(attr) != DCB_ATTR_IEEE_APP) { + fprintf(stderr, "Unknown attribute in DCB_ATTR_IEEE_APP_TABLE: %d\n", + mnl_attr_get_type(attr)); + return MNL_CB_OK; + } + if (mnl_attr_get_payload_len(attr) < sizeof(struct dcb_app)) { + fprintf(stderr, "DCB_ATTR_IEEE_APP payload expected to have size %zd, not %d\n", + sizeof(struct dcb_app), 
mnl_attr_get_payload_len(attr)); + return MNL_CB_OK; + } + + app = mnl_attr_get_payload(attr); + ret = dcb_app_table_push(tab, app); + if (ret != 0) + return MNL_CB_ERROR; + + return MNL_CB_OK; +} + +static int dcb_app_get(struct dcb *dcb, const char *dev, struct dcb_app_table *tab) +{ + uint16_t payload_len; + void *payload; + int ret; + + ret = dcb_get_attribute_va(dcb, dev, DCB_ATTR_IEEE_APP_TABLE, &payload, &payload_len); + if (ret != 0) + return ret; + + ret = mnl_attr_parse_payload(payload, payload_len, dcb_app_get_table_attr_cb, tab); + if (ret != MNL_CB_OK) + return -EINVAL; + + return 0; +} + +struct dcb_app_add_del { + const struct dcb_app_table *tab; + bool (*filter)(const struct dcb_app *app); +}; + +static int dcb_app_add_del_cb(struct dcb *dcb, struct nlmsghdr *nlh, void *data) +{ + struct dcb_app_add_del *add_del = data; + struct nlattr *nest; + size_t i; + + nest = mnl_attr_nest_start(nlh, DCB_ATTR_IEEE_APP_TABLE); + + for (i = 0; i < add_del->tab->n_apps; i++) { + const struct dcb_app *app = &add_del->tab->apps[i]; + + if (add_del->filter == NULL || add_del->filter(app)) + mnl_attr_put(nlh, DCB_ATTR_IEEE_APP, sizeof(*app), app); + } + + mnl_attr_nest_end(nlh, nest); + return 0; +} + +static int dcb_app_add_del(struct dcb *dcb, const char *dev, int command, + const struct dcb_app_table *tab, + bool (*filter)(const struct dcb_app *)) +{ + struct dcb_app_add_del add_del = { + .tab = tab, + .filter = filter, + }; + + if (tab->n_apps == 0) + return 0; + + return dcb_set_attribute_va(dcb, command, dev, dcb_app_add_del_cb, &add_del); +} + +static int dcb_cmd_app_parse_add_del(struct dcb *dcb, const char *dev, + int argc, char **argv, struct dcb_app_table *tab) +{ + struct dcb_app_parse_mapping pm = { + .tab = tab, + }; + int ret; + + if (!argc) { + dcb_app_help_add(); + return 0; + } + + do { + if (matches(*argv, "help") == 0) { + dcb_app_help_add(); + return 0; + } else if (matches(*argv, "ethtype-prio") == 0) { + NEXT_ARG(); + pm.selector = 
IEEE_8021QAZ_APP_SEL_ETHERTYPE; + ret = parse_mapping(&argc, &argv, false, + &dcb_app_parse_mapping_ethtype_prio, + &pm); + } else if (matches(*argv, "default-prio") == 0) { + NEXT_ARG(); + ret = dcb_app_parse_default_prio(&argc, &argv, pm.tab); + if (ret != 0) { + fprintf(stderr, "Invalid default priority %s\n", *argv); + return ret; + } + } else if (matches(*argv, "dscp-prio") == 0) { + NEXT_ARG(); + pm.selector = IEEE_8021QAZ_APP_SEL_DSCP; + ret = parse_mapping_gen(&argc, &argv, + &dcb_app_parse_dscp, + &dcb_app_parse_mapping_dscp_prio, + &pm); + } else if (matches(*argv, "stream-port-prio") == 0) { + NEXT_ARG(); + pm.selector = IEEE_8021QAZ_APP_SEL_STREAM; + ret = parse_mapping(&argc, &argv, false, + &dcb_app_parse_mapping_port_prio, + &pm); + } else if (matches(*argv, "dgram-port-prio") == 0) { + NEXT_ARG(); + pm.selector = IEEE_8021QAZ_APP_SEL_DGRAM; + ret = parse_mapping(&argc, &argv, false, + &dcb_app_parse_mapping_port_prio, + &pm); + } else if (matches(*argv, "port-prio") == 0) { + NEXT_ARG(); + pm.selector = IEEE_8021QAZ_APP_SEL_ANY; + ret = parse_mapping(&argc, &argv, false, + &dcb_app_parse_mapping_port_prio, + &pm); + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + dcb_app_help_add(); + return -EINVAL; + } + + if (ret != 0) { + fprintf(stderr, "Invalid mapping %s\n", *argv); + return ret; + } + if (pm.err) + return pm.err; + } while (argc > 0); + + return 0; +} + +static int dcb_cmd_app_add(struct dcb *dcb, const char *dev, int argc, char **argv) +{ + struct dcb_app_table tab = {}; + int ret; + + ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab); + if (ret != 0) + return ret; + + ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_SET, &tab, NULL); + dcb_app_table_fini(&tab); + return ret; +} + +static int dcb_cmd_app_del(struct dcb *dcb, const char *dev, int argc, char **argv) +{ + struct dcb_app_table tab = {}; + int ret; + + ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab); + if (ret != 0) + return ret; + + ret = 
dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab, NULL);
+	dcb_app_table_fini(&tab);
+	return ret;
+}
+
+/* Handle "dcb app show dev DEV [ KEYWORD ... ]".
+ *
+ * Fetches the APP table of DEV, sorts it and prints it inside one JSON
+ * object. Without keywords every category is printed; otherwise each
+ * keyword prints the matching category, in command-line order.
+ *
+ * Returns 0 on success, the error from dcb_app_get(), or -EINVAL when an
+ * unknown keyword is given.
+ */
+static int dcb_cmd_app_show(struct dcb *dcb, const char *dev, int argc, char **argv)
+{
+	struct dcb_app_table tab = {};
+	int ret;
+
+	ret = dcb_app_get(dcb, dev, &tab);
+	if (ret != 0)
+		return ret;
+
+	dcb_app_table_sort(&tab);
+
+	open_json_object(NULL);
+
+	if (!argc) {
+		dcb_app_print(dcb, &tab);
+		goto out;
+	}
+
+	do {
+		if (matches(*argv, "help") == 0) {
+			dcb_app_help_show_flush();
+			goto out;
+		} else if (matches(*argv, "ethtype-prio") == 0) {
+			dcb_app_print_ethtype_prio(&tab);
+		} else if (matches(*argv, "dscp-prio") == 0) {
+			dcb_app_print_dscp_prio(dcb, &tab);
+		} else if (matches(*argv, "stream-port-prio") == 0) {
+			dcb_app_print_stream_port_prio(&tab);
+		} else if (matches(*argv, "dgram-port-prio") == 0) {
+			dcb_app_print_dgram_port_prio(&tab);
+		} else if (matches(*argv, "port-prio") == 0) {
+			dcb_app_print_port_prio(&tab);
+		} else if (matches(*argv, "default-prio") == 0) {
+			/* Checked last so that abbreviations which already
+			 * resolved to another keyword keep doing so.
+			 */
+			dcb_app_print_default_prio(&tab);
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			dcb_app_help_show_flush();
+			ret = -EINVAL;
+			goto out;
+		}
+
+		NEXT_ARG_FWD();
+	} while (argc > 0);
+
+out:
+	close_json_object();
+	dcb_app_table_fini(&tab);
+	/* ret is 0 here unless an unknown keyword was seen. */
+	return ret;
+}
+
+static int dcb_cmd_app_flush(struct dcb *dcb, const char *dev, int argc, char **argv)
+{
+	struct dcb_app_table tab = {};
+	int ret;
+
+	ret = dcb_app_get(dcb, dev, &tab);
+	if (ret != 0)
+		return ret;
+
+	if (!argc) {
+		ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab, NULL);
+		goto out;
+	}
+
+	do {
+		if (matches(*argv, "help") == 0) {
+			dcb_app_help_show_flush();
+			goto out;
+		} else if (matches(*argv, "ethtype-prio") == 0) {
+			ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_ethtype);
+			if (ret != 0)
+				goto out;
+		} else if (matches(*argv, "default-prio") == 0) {
+			ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_default);
+			if (ret != 0)
+				goto out;
+		} else if (matches(*argv, "dscp-prio") == 0) {
+			ret = dcb_app_add_del(dcb, dev,
DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_dscp);
+			if (ret != 0)
+				goto out;
+		} else if (matches(*argv, "stream-port-prio") == 0) {
+			/* The port-based keywords are accepted here as well,
+			 * each deleting the entries picked by its selector
+			 * predicate. They follow the older keywords so that
+			 * existing abbreviations resolve as before.
+			 */
+			ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_stream_port);
+			if (ret != 0)
+				goto out;
+		} else if (matches(*argv, "dgram-port-prio") == 0) {
+			ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_dgram_port);
+			if (ret != 0)
+				goto out;
+		} else if (matches(*argv, "port-prio") == 0) {
+			ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
+					      &dcb_app_is_port);
+			if (ret != 0)
+				goto out;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			dcb_app_help_show_flush();
+			ret = -EINVAL;
+			goto out;
+		}
+
+		NEXT_ARG_FWD();
+	} while (argc > 0);
+
+out:
+	dcb_app_table_fini(&tab);
+	return ret;
+}
+
+static int dcb_cmd_app_replace(struct dcb *dcb, const char *dev, int argc, char **argv)
+{
+	struct dcb_app_table orig = {};
+	struct dcb_app_table tab = {};
+	struct dcb_app_table new = {};
+	int ret;
+
+	ret = dcb_app_get(dcb, dev, &orig);
+	if (ret != 0)
+		return ret;
+
+	ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab);
+	if (ret != 0)
+		goto out;
+
+	/* Attempts to add an existing entry would be rejected, so drop
+	 * these entries from tab.
+	 */
+	ret = dcb_app_table_copy(&new, &tab);
+	if (ret != 0)
+		goto out;
+	dcb_app_table_remove_existing(&new, &orig);
+
+	ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_SET, &new, NULL);
+	if (ret != 0) {
+		fprintf(stderr, "Could not add new APP entries\n");
+		goto out;
+	}
+
+	/* Remove the obsolete entries.
*/
+	dcb_app_table_remove_replaced(&orig, &tab);
+	ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &orig, NULL);
+	if (ret != 0) {
+		fprintf(stderr, "Could not remove replaced APP entries\n");
+		goto out;
+	}
+
+out:
+	dcb_app_table_fini(&new);
+	dcb_app_table_fini(&tab);
+	dcb_app_table_fini(&orig);
+	/* Every failure path above sets ret before jumping here; report it
+	 * instead of unconditionally claiming success.
+	 */
+	return ret;
+}
+
+/* Entry point of the "dcb app" object: dispatch on the first argument to
+ * the show / flush / add / del / replace handlers. No argument or "help"
+ * prints the usage and returns 0; an unknown command returns -EINVAL.
+ */
+int dcb_cmd_app(struct dcb *dcb, int argc, char **argv)
+{
+	if (!argc || matches(*argv, "help") == 0) {
+		dcb_app_help();
+		return 0;
+	} else if (matches(*argv, "show") == 0) {
+		NEXT_ARG_FWD();
+		return dcb_cmd_parse_dev(dcb, argc, argv,
+					 dcb_cmd_app_show, dcb_app_help_show_flush);
+	} else if (matches(*argv, "flush") == 0) {
+		NEXT_ARG_FWD();
+		return dcb_cmd_parse_dev(dcb, argc, argv,
+					 dcb_cmd_app_flush, dcb_app_help_show_flush);
+	} else if (matches(*argv, "add") == 0) {
+		NEXT_ARG_FWD();
+		return dcb_cmd_parse_dev(dcb, argc, argv,
+					 dcb_cmd_app_add, dcb_app_help_add);
+	} else if (matches(*argv, "del") == 0) {
+		NEXT_ARG_FWD();
+		return dcb_cmd_parse_dev(dcb, argc, argv,
+					 dcb_cmd_app_del, dcb_app_help_add);
+	} else if (matches(*argv, "replace") == 0) {
+		NEXT_ARG_FWD();
+		return dcb_cmd_parse_dev(dcb, argc, argv,
+					 dcb_cmd_app_replace, dcb_app_help_add);
+	} else {
+		fprintf(stderr, "What is \"%s\"?\n", *argv);
+		dcb_app_help();
+		return -EINVAL;
+	}
+}
diff --git a/man/man8/dcb-app.8 b/man/man8/dcb-app.8
new file mode 100644
index 00000000..23fd3374
--- /dev/null
+++ b/man/man8/dcb-app.8
@@ -0,0 +1,237 @@
+.TH DCB-APP 8 "6 December 2020" "iproute2" "Linux"
+.SH NAME
+dcb-app \- show / manipulate application priority table of
+the DCB (Data Center Bridging) subsystem
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+
+.ti -8
+.B dcb
+.RI "[ " OPTIONS " ] "
+.B app
+.RI "{ " COMMAND " | " help " }"
+.sp
+
+.ti -8
+.B dcb app " { " show " | " flush " } " dev
+.RI DEV
+.RB "[ " default-prio " ]"
+.RB "[ " ethtype-prio " ]"
+.RB "[ " stream-port-prio " ]"
+.RB "[ " dgram-port-prio " ]"
+.RB "[ " port-prio " ]"
+.RB "[ " dscp-prio " ]"
+
+.ti -8
+.B dcb app
" { " add " | " del " | " replace " } " dev +.RI DEV +.RB "[ " default-prio " " \fIPRIO-LIST\fB " ]" +.RB "[ " ethtype-prio " " \fIET-MAP\fB " ]" +.RB "[ " stream-port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " dgram-port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " port-prio " " \fIPORT-MAP\fB " ]" +.RB "[ " dscp-prio " " \fIDSCP-MAP\fB " ]" + +.ti -8 +.IR PRIO-LIST " := [ " PRIO-LIST " ] " PRIO + +.ti -8 +.IR ET-MAP " := [ " ET-MAP " ] " ET-MAPPING + +.ti -8 +.IR ET-MAPPING " := " ET\fB:\fIPRIO\fR + +.ti -8 +.IR PORT-MAP " := [ " PORT-MAP " ] " PORT-MAPPING + +.ti -8 +.IR PORT-MAPPING " := " PORT\fB:\fIPRIO\fR + +.ti -8 +.IR DSCP-MAP " := [ " DSCP-MAP " ] " DSCP-MAPPING + +.ti -8 +.IR DSCP-MAPPING " := { " DSCP " | " \fBall " }" \fB:\fIPRIO\fR + +.ti -8 +.IR ET " := { " \fB0x600\fR " .. " \fB0xffff\fR " }" + +.ti -8 +.IR PORT " := { " \fB1\fR " .. " \fB65535\fR " }" + +.ti -8 +.IR DSCP " := { " \fB0\fR " .. " \fB63\fR " }" + +.ti -8 +.IR PRIO " := { " \fB0\fR " .. " \fB7\fR " }" + +.SH DESCRIPTION + +.B dcb app +is used to configure APP table, or application priority table in the DCB (Data +Center Bridging) subsystem. The APP table is used to assign priority to traffic +based on value in one of several headers: EtherType, L4 destination port, or +DSCP. It also allows configuration of port-default priority that is chosen if no +other prioritization rule applies. + +DCB APP entries are 3-tuples of selector, protocol ID, and priority. Selector is +an enumeration that picks one of the prioritization namespaces. Currently it +mostly corresponds to configurable parameters described below. Protocol ID is a +value in the selector namespace. E.g. for EtherType selector, protocol IDs are +the individual EtherTypes, for DSCP they are individual code points. The +priority is the priority that should be assigned to traffic that matches the +selector and protocol ID. + +The APP table is a set of DCB APP entries. The only requirement is that +duplicate entries are not added. 
Notably, it is valid to have conflicting +priority assignment for the same selector and protocol ID. For example, the set +of two APP entries (DSCP, 10, 1) and (DSCP, 10, 2), where packets with DSCP of +10 should get priority of both 1 and 2, form a well-defined APP table. The +.B dcb app +tool allows low-level management of the app table by adding and deleting +individual APP 3-tuples through +.B add +and +.B del +commands. On the other hand, the command +.B replace +does what one would typically want in this situation--first adds the new +configuration, and then removes the obsolete one, so that only one +prioritization is in effect for a given selector and protocol ID. + +.SH COMMANDS + +.TP +.B show +Display all entries with a given selector. When no selector is given, shows all +APP table entries categorized per selector. + +.TP +.B flush +Remove all entries with a given selector. When no selector is given, removes all +APP table entries. + +.TP +.B add +.TQ +.B del +Add and, respectively, remove individual APP 3-tuples to and from the DCB APP +table. + +.TP +.B replace +Take the list of entries mentioned as parameter, and add those that are not +present in the APP table yet. Then remove those entries, whose selector and +protocol ID have been mentioned as parameter, but not with the exact same +priority. This has the effect of, for the given selector and protocol ID, +causing that the table only contains the priority (or priorities) given as +parameter. + +.SH PARAMETERS + +The following table shows parameters in a way that they would be used with +\fBadd\fR, \fBdel\fR and \fBreplace\fR commands. For \fBshow\fR and \fBflush\fR, +the parameter name is to be used as a simple keyword without further arguments. + +.TP +.B default-prio \fIPRIO-LIST +The priority to be used for traffic the priority of which is otherwise +unspecified. The argument is a list of individual priorities. 
Note that +.B default-prio +rules are configured as triplets (\fBEtherType\fR, \fB0\fR, \fIPRIO\fR). +.B dcb app +translates these rules to the symbolic name +.B default-prio +and back. + +.TP +.B ethtype-prio \fIET-MAP +\fIET-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are EtherType values. Values are priorities to be assigned to +traffic with the matching EtherType. + +.TP +.B stream-port-prio \fIPORT-MAP +.TQ +.B dgram-port-prio \fIPORT-MAP +.TQ +.B port-prio \fIPORT-MAP +\fIPORT-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are L4 destination port numbers that match on, respectively, +TCP and SCTP traffic, UDP and DCCP traffic, and either of those. Values are +priorities that should be assigned to matching traffic. + +.TP +.B dscp-prio \fIDSCP-MAP +\fIDSCP-MAP\fR uses the array parameter syntax, see +.BR dcb (8) +for details. Keys are DSCP points, values are priorities assigned to +traffic with matching DSCP. DSCP points can be written either directly as +numeric values, or using symbolic names specified in +.B /etc/iproute2/rt_dsfield +(however note that that file specifies full 8-bit dsfield values, whereas +.B dcb app +will only use the higher six bits). +.B dcb app show +will similarly format DSCP values as symbolic names if possible. The +command line option +.B -N +turns the show translation off. 
+ +.SH EXAMPLE & USAGE + +Prioritize traffic with DSCP 0 to priority 0, 24 to 3 and 48 to 6: + +.P +# dcb app add dev eth0 dscp-prio 0:0 24:3 48:6 + +Add another rule to configure DSCP 24 to priority 2 and show the result: + +.P +# dcb app add dev eth0 dscp-prio 24:2 +.br +# dcb app show dev eth0 dscp-prio +.br +dscp-prio 0:0 CS3:2 CS3:3 CS6:6 +.br +# dcb -N app show dev eth0 dscp-prio +.br +dscp-prio 0:0 24:2 24:3 48:6 + +Reconfigure the table so that the only rule for DSCP 24 is for assignment of +priority 4: + +.P +# dcb app replace dev eth0 dscp-prio 24:4 +.br +# dcb app show dev eth0 dscp-prio +.br +dscp-prio 0:0 24:4 48:6 + +Flush all DSCP rules: + +.P +# dcb app flush dev eth0 dscp-prio +.br +# dcb app show dev eth0 dscp-prio +.br +(nothing) + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Petr Machata diff --git a/man/man8/dcb.8 b/man/man8/dcb.8 index 1e161eb3..24944b73 100644 --- a/man/man8/dcb.8 +++ b/man/man8/dcb.8 @@ -9,7 +9,7 @@ dcb \- show / manipulate DCB (Data Center Bridging) settings .ti -8 .B dcb .RI "[ " OPTIONS " ] " -.RB "{ " buffer " | " ets " | " maxrate " | " pfc " }" +.RB "{ " app " | " buffer " | " ets " | " maxrate " | " pfc " }" .RI "{ " COMMAND " | " help " }" .sp @@ -75,6 +75,10 @@ part of the "show" output. .SH OBJECTS +.TP +.B app +- Configuration of application priority table + .TP .B buffer - Configuration of port buffers @@ -135,6 +139,7 @@ other values: Exit status is 0 if command was successful or a positive integer upon failure. 
.SH SEE ALSO +.BR dcb-app (8), .BR dcb-buffer (8), .BR dcb-ets (8), .BR dcb-maxrate (8), From 89d11ea596498841a8e0e93643cb3141fb8e6e14 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 2 Jan 2021 01:03:41 +0100 Subject: [PATCH 10/32] dcb: Add a subtool for the DCBX object The Linux DCBX object is a 1-byte bitfield of flags that configure whether the DCBX protocol is implemented in the device or in the host, and which version of the protocol should be used. Add a tool to access the per-port Linux DCBX object. For example: # dcb dcbx set dev eni1np1 host ieee # dcb dcbx show dev eni1np1 host ieee Signed-off-by: Petr Machata Signed-off-by: David Ahern --- dcb/Makefile | 1 + dcb/dcb.c | 4 +- dcb/dcb.h | 4 + dcb/dcb_dcbx.c | 192 ++++++++++++++++++++++++++++++++++++++++++++ man/man8/dcb-dcbx.8 | 108 +++++++++++++++++++++++++ 5 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 dcb/dcb_dcbx.c create mode 100644 man/man8/dcb-dcbx.8 diff --git a/dcb/Makefile b/dcb/Makefile index 13d45f2b..02d5d044 100644 --- a/dcb/Makefile +++ b/dcb/Makefile @@ -8,6 +8,7 @@ ifeq ($(HAVE_MNL),y) DCBOBJ = dcb.o \ dcb_app.o \ dcb_buffer.o \ + dcb_dcbx.o \ dcb_ets.o \ dcb_maxrate.o \ dcb_pfc.o diff --git a/dcb/dcb.c b/dcb/dcb.c index 644059c4..64a9ef02 100644 --- a/dcb/dcb.c +++ b/dcb/dcb.c @@ -465,7 +465,7 @@ static void dcb_help(void) fprintf(stderr, "Usage: dcb [ OPTIONS ] OBJECT { COMMAND | help }\n" " dcb [ -f | --force ] { -b | --batch } filename [ -n | --netns ] netnsname\n" - "where OBJECT := { app | buffer | ets | maxrate | pfc }\n" + "where OBJECT := { app | buffer | dcbx | ets | maxrate | pfc }\n" " OPTIONS := [ -V | --Version | -i | --iec | -j | --json\n" " | -N | --Numeric | -p | --pretty\n" " | -s | --statistics | -v | --verbose]\n"); @@ -480,6 +480,8 @@ static int dcb_cmd(struct dcb *dcb, int argc, char **argv) return dcb_cmd_app(dcb, argc - 1, argv + 1); } else if (matches(*argv, "buffer") == 0) { return dcb_cmd_buffer(dcb, argc - 1, argv + 1); + } else if 
(matches(*argv, "dcbx") == 0) { + return dcb_cmd_dcbx(dcb, argc - 1, argv + 1); } else if (matches(*argv, "ets") == 0) { return dcb_cmd_ets(dcb, argc - 1, argv + 1); } else if (matches(*argv, "maxrate") == 0) { diff --git a/dcb/dcb.h b/dcb/dcb.h index 92eada16..244c3d3c 100644 --- a/dcb/dcb.h +++ b/dcb/dcb.h @@ -62,6 +62,10 @@ int dcb_cmd_app(struct dcb *dcb, int argc, char **argv); int dcb_cmd_buffer(struct dcb *dcb, int argc, char **argv); +/* dcb_dcbx.c */ + +int dcb_cmd_dcbx(struct dcb *dcb, int argc, char **argv); + /* dcb_ets.c */ int dcb_cmd_ets(struct dcb *dcb, int argc, char **argv); diff --git a/dcb/dcb_dcbx.c b/dcb/dcb_dcbx.c new file mode 100644 index 00000000..244b671b --- /dev/null +++ b/dcb/dcb_dcbx.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include + +#include "dcb.h" +#include "utils.h" + +static void dcb_dcbx_help_set(void) +{ + fprintf(stderr, + "Usage: dcb dcbx set dev STRING\n" + " [ host | lld-managed ]\n" + " [ cee | ieee ] [ static ]\n" + "\n" + ); +} + +static void dcb_dcbx_help_show(void) +{ + fprintf(stderr, + "Usage: dcb dcbx show dev STRING\n" + "\n" + ); +} + +static void dcb_dcbx_help(void) +{ + fprintf(stderr, + "Usage: dcb dcbx help\n" + "\n" + ); + dcb_dcbx_help_show(); + dcb_dcbx_help_set(); +} + +struct dcb_dcbx_flag { + __u8 value; + const char *key_fp; + const char *key_json; +}; + +static struct dcb_dcbx_flag dcb_dcbx_flags[] = { + {DCB_CAP_DCBX_HOST, "host"}, + {DCB_CAP_DCBX_LLD_MANAGED, "lld-managed", "lld_managed"}, + {DCB_CAP_DCBX_VER_CEE, "cee"}, + {DCB_CAP_DCBX_VER_IEEE, "ieee"}, + {DCB_CAP_DCBX_STATIC, "static"}, +}; + +static void dcb_dcbx_print(__u8 dcbx) +{ + int bit; + int i; + + while ((bit = ffs(dcbx))) { + bool found = false; + + bit--; + for (i = 0; i < ARRAY_SIZE(dcb_dcbx_flags); i++) { + struct dcb_dcbx_flag *flag = &dcb_dcbx_flags[i]; + + if (flag->value == 1 << bit) { + print_bool(PRINT_JSON, flag->key_json ?: flag->key_fp, + NULL, true); + 
print_string(PRINT_FP, NULL, "%s ", flag->key_fp); + found = true; + break; + } + } + + if (!found) + fprintf(stderr, "Unknown DCBX bit %#x.\n", 1 << bit); + + dcbx &= ~(1 << bit); + } + + print_nl(); +} + +static int dcb_dcbx_get(struct dcb *dcb, const char *dev, __u8 *dcbx) +{ + __u16 payload_len; + void *payload; + int err; + + err = dcb_get_attribute_bare(dcb, DCB_CMD_IEEE_GET, dev, DCB_ATTR_DCBX, + &payload, &payload_len); + if (err != 0) + return err; + + if (payload_len != 1) { + fprintf(stderr, "DCB_ATTR_DCBX payload has size %d, expected 1.\n", + payload_len); + return -EINVAL; + } + *dcbx = *(__u8 *) payload; + return 0; +} + +static int dcb_dcbx_set(struct dcb *dcb, const char *dev, __u8 dcbx) +{ + return dcb_set_attribute_bare(dcb, DCB_CMD_SDCBX, dev, DCB_ATTR_DCBX, + &dcbx, 1, DCB_ATTR_DCBX); +} + +static int dcb_cmd_dcbx_set(struct dcb *dcb, const char *dev, int argc, char **argv) +{ + __u8 dcbx = 0; + __u8 i; + + if (!argc) { + dcb_dcbx_help_set(); + return 0; + } + + do { + if (matches(*argv, "help") == 0) { + dcb_dcbx_help_set(); + return 0; + } + + for (i = 0; i < ARRAY_SIZE(dcb_dcbx_flags); i++) { + struct dcb_dcbx_flag *flag = &dcb_dcbx_flags[i]; + + if (matches(*argv, flag->key_fp) == 0) { + dcbx |= flag->value; + NEXT_ARG_FWD(); + goto next; + } + } + + fprintf(stderr, "What is \"%s\"?\n", *argv); + dcb_dcbx_help_set(); + return -EINVAL; + +next: + ; + } while (argc > 0); + + return dcb_dcbx_set(dcb, dev, dcbx); +} + +static int dcb_cmd_dcbx_show(struct dcb *dcb, const char *dev, int argc, char **argv) +{ + __u8 dcbx; + int ret; + + ret = dcb_dcbx_get(dcb, dev, &dcbx); + if (ret != 0) + return ret; + + while (argc > 0) { + if (matches(*argv, "help") == 0) { + dcb_dcbx_help_show(); + return 0; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + dcb_dcbx_help_show(); + return -EINVAL; + } + + NEXT_ARG_FWD(); + } + + open_json_object(NULL); + dcb_dcbx_print(dcbx); + close_json_object(); + return 0; +} + +int dcb_cmd_dcbx(struct dcb *dcb, 
int argc, char **argv) +{ + if (!argc || matches(*argv, "help") == 0) { + dcb_dcbx_help(); + return 0; + } else if (matches(*argv, "show") == 0) { + NEXT_ARG_FWD(); + return dcb_cmd_parse_dev(dcb, argc, argv, + dcb_cmd_dcbx_show, dcb_dcbx_help_show); + } else if (matches(*argv, "set") == 0) { + NEXT_ARG_FWD(); + return dcb_cmd_parse_dev(dcb, argc, argv, + dcb_cmd_dcbx_set, dcb_dcbx_help_set); + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + dcb_dcbx_help(); + return -EINVAL; + } +} diff --git a/man/man8/dcb-dcbx.8 b/man/man8/dcb-dcbx.8 new file mode 100644 index 00000000..52133e34 --- /dev/null +++ b/man/man8/dcb-dcbx.8 @@ -0,0 +1,108 @@ +.TH DCB-DCBX 8 "13 December 2020" "iproute2" "Linux" +.SH NAME +dcb-dcbx \- show / manipulate port DCBX (Data Center Bridging eXchange) +.SH SYNOPSIS +.sp +.ad l +.in +8 + +.ti -8 +.B dcb +.RI "[ " OPTIONS " ] " +.B dcbx +.RI "{ " COMMAND " | " help " }" +.sp + +.ti -8 +.B dcb dcbx show dev +.RI DEV + +.ti -8 +.B dcb dcbx set dev +.RI DEV +.RB "[ " host " ]" +.RB "[ " lld-managed " ]" +.RB "[ " cee " ]" +.RB "[ " ieee " ]" +.RB "[ " static " ]" + +.SH DESCRIPTION + +Data Center Bridging eXchange (DCBX) is a protocol used by DCB devices to +exchange configuration information with directly connected peers. The Linux DCBX +object is a 1-byte bitfield of flags that configure whether DCBX is implemented +in the device or in the host, and which version of the protocol should be used. +.B dcb dcbx +is used to access the per-port Linux DCBX object. + +There are two principal modes of operation: in +.B host +mode, DCBX protocol is implemented by the host LLDP agent, and the DCB +interfaces are used to propagate the negotiated parameters to capable devices. In +.B lld-managed +mode, the configuration is handled by the device, and DCB interfaces are used +for inspection of negotiated parameters, and can also be used to set initial +parameters. 
+ +.SH PARAMETERS + +When used with +.B dcb dcbx set, +the following keywords enable the corresponding configuration. The keywords that +are not mentioned on the command line are considered disabled. When used with +.B show, +each enabled feature is shown by its corresponding keyword. + +.TP +.B host +.TQ +.B lld-managed +The device is in the host mode of operation and, respectively, the lld-managed +mode of operation, as described above. In principle these two keywords are +mutually exclusive, but +.B dcb dcbx +allows setting both and lets the driver handle it as appropriate. + +.TP +.B cee +.TQ +.B ieee +The device supports CEE (Converged Enhanced Ethernet) and, respectively, IEEE +version of the DCB specification. Typically only one of these will be set, but +.B dcb dcbx +does not mandate this. + +.TP +.B static +indicates the engine supports static configuration. No actual negotiation is +performed, negotiated parameters are always the initial configuration. + +.SH EXAMPLE & USAGE + +Put the DCB engine into the "host" mode of operation, and use IEEE-standardized +DCB interfaces: + +.P +# dcb dcbx set dev eth0 host ieee + +Show what was set: + +.P +# dcb dcbx show dev eth0 +.br +host ieee + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR dcb (8) + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. 
+ +.SH AUTHOR +Petr Machata From c81a173f6b1b53492f91dd1afff803790048c1d3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 23 Jan 2021 18:15:57 +0000 Subject: [PATCH 11/32] Update kernel headers Update kernel headers to commit: 59a49d9617e2 ("Merge branch 'mlxsw-expose-number-of-physical-ports'") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 20 +++++++++++++------- include/uapi/linux/if_bonding.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/if_tunnel.h | 1 + include/uapi/linux/mptcp.h | 1 + include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/pkt_sched.h | 1 + include/uapi/linux/tcp.h | 23 ++++++++++++++--------- 8 files changed, 33 insertions(+), 16 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd28bc76..1daeda13 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -43,6 +44,11 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + /* Register numbers */ enum { BPF_REG_0 = 0, @@ -2448,7 +2454,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. 
@@ -2993,10 +2999,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3030,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. 
* diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index 45f3750a..e8eb4ad0 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -94,6 +94,7 @@ #define BOND_XMIT_POLICY_LAYER23 2 /* layer 2+3 (IP ^ MAC) */ #define BOND_XMIT_POLICY_ENCAP23 3 /* encapsulated layer 2+3 */ #define BOND_XMIT_POLICY_ENCAP34 4 /* encapsulated layer 3+4 */ +#define BOND_XMIT_POLICY_VLAN_SRCMAC 5 /* vlan + source MAC */ /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */ #define LACP_STATE_LACP_ACTIVITY 0x1 diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 97a95cf7..b4cf6679 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -807,6 +807,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index c7f0a5e6..cb32781c 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,6 +176,7 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 32181230..d31c9c6c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -82,6 +82,7 @@ enum { MPTCP_PM_CMD_FLUSH_ADDRS, MPTCP_PM_CMD_SET_LIMITS, MPTCP_PM_CMD_GET_LIMITS, + MPTCP_PM_CMD_SET_FLAGS, __MPTCP_PM_CMD_AFTER_LAST }; diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee95f42f..709668e2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -591,6 +591,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing 
connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ }; enum { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9e7c2c60..79a699f1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -434,6 +434,7 @@ enum { TCA_HTB_RATE64, TCA_HTB_CEIL64, TCA_HTB_PAD, + TCA_HTB_OFFLOAD, __TCA_HTB_MAX, }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6f1a38fc..0614c608 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -51,7 +51,7 @@ struct tcphdr { fin:1; #else #error "Adjust your defines" -#endif +#endif __be16 window; __sum16 check; __be16 urg_ptr; @@ -62,14 +62,14 @@ struct tcphdr { * (union is compatible to any of its members) * This means this part of the code is -fstrict-aliasing safe now. */ -union tcp_word_hdr { +union tcp_word_hdr { struct tcphdr hdr; - __be32 words[5]; -}; + __be32 words[5]; +}; -#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) -enum { +enum { TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), @@ -80,7 +80,7 @@ enum { TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) -}; +}; /* * TCP general constants @@ -103,8 +103,8 @@ enum { #define TCP_QUICKACK 12 /* Block/reenable quick acks */ #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ -#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ -#define TCP_THIN_DUPACK 17 /* Fast retrans. 
after 1 dupack */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 @@ -314,6 +314,7 @@ enum { TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ + TCP_NLA_TTL, /* TTL or hop limit of a packet received */ }; /* for TCP_MD5SIG socket option */ @@ -353,5 +354,9 @@ struct tcp_zerocopy_receive { __u64 copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ + __u64 msg_control; /* ancillary data */ + __u64 msg_controllen; + __u32 msg_flags; + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ }; #endif /* _LINUX_TCP_H */ From c94fd71b340ddf95177b90e2d0be67bcca3eefe3 Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 20 Jan 2021 10:52:12 +0800 Subject: [PATCH 12/32] tc: flower: add tc conntrack inv ct_state support Matches on conntrack inv ct_state. Signed-off-by: wenxu Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 2 ++ tc/f_flower.c | 1 + 2 files changed, 3 insertions(+) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index 1a76b375..226d1cc6 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -387,6 +387,8 @@ new - New connection. .TP est - Established connection. .TP +inv - The state is invalid. The packet couldn't be associated to a connection. 
+.TP Example: +trk+est .RE .TP diff --git a/tc/f_flower.c b/tc/f_flower.c index 1fe0ef42..85c1043a 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -345,6 +345,7 @@ static struct flower_ct_states { { "trk", TCA_FLOWER_KEY_CT_FLAGS_TRACKED }, { "new", TCA_FLOWER_KEY_CT_FLAGS_NEW }, { "est", TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED }, + { "inv", TCA_FLOWER_KEY_CT_FLAGS_INVALID }, }; static int flower_parse_ct_state(char *str, struct nlmsghdr *n) From 7887500008c6c75626b0c35f9c83f0eb510a5418 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 15 Jan 2021 14:21:37 -0500 Subject: [PATCH 13/32] bond: support xmit_hash_policy=vlan+srcmac There's a new transmit hash policy being added to the bonding driver that is a simple XOR of vlan ID and source MAC, xmit_hash_policy vlan+srcmac. This trivial patch makes it configurable and queryable via iproute2. $ sudo modprobe bonding mode=2 max_bonds=1 xmit_hash_policy=0 $ sudo ip link set bond0 type bond xmit_hash_policy vlan+srcmac $ ip -d link show bond0 11: bond0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether ce:85:5e:24:ce:90 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 bond mode balance-xor miimon 0 updelay 0 downdelay 0 peer_notify_delay 0 use_carrier 1 arp_interval 0 arp_validate none arp_all_targets any primary_reselect always fail_over_mac none xmit_hash_policy vlan+srcmac resend_igmp 1 num_grat_arp 1 all_slaves_active 0 min_links 0 lp_interval 1 packets_per_slave 1 lacp_rate slow ad_select stable tlb_dynamic_lb 1 addrgenmode eui64 numtxqueues 16 numrxqueues 16 gso_max_size 65536 gso_max_segs 65535 $ grep Hash /proc/net/bonding/bond0 Transmit Hash Policy: vlan+srcmac (5) $ sudo ip link add test type bond help Usage: ... 
bond [ mode BONDMODE ] [ active_slave SLAVE_DEV ] [ clear_active_slave ] [ miimon MIIMON ] [ updelay UPDELAY ] [ downdelay DOWNDELAY ] [ peer_notify_delay DELAY ] [ use_carrier USE_CARRIER ] [ arp_interval ARP_INTERVAL ] [ arp_validate ARP_VALIDATE ] [ arp_all_targets ARP_ALL_TARGETS ] [ arp_ip_target [ ARP_IP_TARGET, ... ] ] [ primary SLAVE_DEV ] [ primary_reselect PRIMARY_RESELECT ] [ fail_over_mac FAIL_OVER_MAC ] [ xmit_hash_policy XMIT_HASH_POLICY ] [ resend_igmp RESEND_IGMP ] [ num_grat_arp|num_unsol_na NUM_GRAT_ARP|NUM_UNSOL_NA ] [ all_slaves_active ALL_SLAVES_ACTIVE ] [ min_links MIN_LINKS ] [ lp_interval LP_INTERVAL ] [ packets_per_slave PACKETS_PER_SLAVE ] [ tlb_dynamic_lb TLB_DYNAMIC_LB ] [ lacp_rate LACP_RATE ] [ ad_select AD_SELECT ] [ ad_user_port_key PORTKEY ] [ ad_actor_sys_prio SYSPRIO ] [ ad_actor_system LLADDR ] BONDMODE := balance-rr|active-backup|balance-xor|broadcast|802.3ad|balance-tlb|balance-alb ARP_VALIDATE := none|active|backup|all ARP_ALL_TARGETS := any|all PRIMARY_RESELECT := always|better|failure FAIL_OVER_MAC := none|active|follow XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+srcmac LACP_RATE := slow|fast AD_SELECT := stable|bandwidth|count Cc: Stephen Hemminger Cc: Jay Vosburgh Signed-off-by: Jarod Wilson Signed-off-by: David Ahern --- ip/iplink_bond.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c index 585b6be1..d45845bd 100644 --- a/ip/iplink_bond.c +++ b/ip/iplink_bond.c @@ -70,6 +70,7 @@ static const char *xmit_hash_policy_tbl[] = { "layer2+3", "encap2+3", "encap3+4", + "vlan+srcmac", NULL, }; @@ -148,7 +149,7 @@ static void print_explain(FILE *f) "ARP_ALL_TARGETS := any|all\n" "PRIMARY_RESELECT := always|better|failure\n" "FAIL_OVER_MAC := none|active|follow\n" - "XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4\n" + "XMIT_HASH_POLICY := layer2|layer2+3|layer3+4|encap2+3|encap3+4|vlan+srcmac\n" "LACP_RATE := slow|fast\n" "AD_SELECT := 
stable|bandwidth|count\n" ); From 2ce313d1bb5b3fc8554b71e551b9f72ef7ac78d3 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 25 Jan 2021 11:40:55 +0100 Subject: [PATCH 14/32] iplink_can: add Classical CAN frame LEN8_DLC support The len8_dlc element is filled by the CAN interface driver and used for CAN frame creation by the CAN driver when the CAN_CTRLMODE_CC_LEN8_DLC flag is supported by the driver and enabled via netlink configuration interface. Add the command line support for cc-len8-dlc for Linux 5.11+ Signed-off-by: Oliver Hartkopp Signed-off-by: David Ahern --- ip/iplink_can.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ip/iplink_can.c b/ip/iplink_can.c index 735ab941..6a26f3ff 100644 --- a/ip/iplink_can.c +++ b/ip/iplink_can.c @@ -37,6 +37,7 @@ static void print_usage(FILE *f) "\t[ fd { on | off } ]\n" "\t[ fd-non-iso { on | off } ]\n" "\t[ presume-ack { on | off } ]\n" + "\t[ cc-len8-dlc { on | off } ]\n" "\n" "\t[ restart-ms TIME-MS ]\n" "\t[ restart ]\n" @@ -103,6 +104,7 @@ static void print_ctrlmode(FILE *f, __u32 cm) _PF(CAN_CTRLMODE_FD, "FD"); _PF(CAN_CTRLMODE_FD_NON_ISO, "FD-NON-ISO"); _PF(CAN_CTRLMODE_PRESUME_ACK, "PRESUME-ACK"); + _PF(CAN_CTRLMODE_CC_LEN8_DLC, "CC-LEN8-DLC"); #undef _PF if (cm) print_hex(PRINT_ANY, NULL, "%x", cm); @@ -211,6 +213,10 @@ static int can_parse_opt(struct link_util *lu, int argc, char **argv, NEXT_ARG(); set_ctrlmode("presume-ack", *argv, &cm, CAN_CTRLMODE_PRESUME_ACK); + } else if (matches(*argv, "cc-len8-dlc") == 0) { + NEXT_ARG(); + set_ctrlmode("cc-len8-dlc", *argv, &cm, + CAN_CTRLMODE_CC_LEN8_DLC); } else if (matches(*argv, "restart") == 0) { __u32 val = 1; From 1e6190218050a21475789bff7a9beef3c360f03c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Feb 2021 01:58:51 +0000 Subject: [PATCH 15/32] Update kernel headers Update kernel headers to commit: 14e8e0f60088 ("tcp: shrink inet_connection_sock icsk_mtup enabled and probe_size") Signed-off-by: David Ahern --- 
include/uapi/linux/devlink.h | 25 +++++++++++++++++++++++++ include/uapi/linux/if_link.h | 2 ++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/rpl.h | 6 +++--- 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 958ef7b9..a430775d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -200,6 +200,10 @@ enum devlink_port_flavour { DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but * is not used in any way. */ + DEVLINK_PORT_FLAVOUR_PCI_SF, /* Represents eswitch port + * for the PCI SF. It is an internal + * port that faces the PCI SF. + */ }; enum devlink_param_cmode { @@ -529,6 +533,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION_INFO, /* nested */ DEVLINK_ATTR_RELOAD_ACTION_STATS, /* nested */ + DEVLINK_ATTR_PORT_PCI_SF_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, @@ -578,9 +583,29 @@ enum devlink_resource_unit { enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 }; +enum devlink_port_fn_state { + DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE, +}; + +/** + * enum devlink_port_fn_opstate - indicates operational state of the function + * @DEVLINK_PORT_FN_OPSTATE_ATTACHED: Driver is attached to the function. + * For graceful tear down of the function, after inactivation of the + * function, user should wait for operational state to turn DETACHED. + * @DEVLINK_PORT_FN_OPSTATE_DETACHED: Driver is detached from the function. + * It is safe to delete the port. 
+ */ +enum devlink_port_fn_opstate { + DEVLINK_PORT_FN_OPSTATE_DETACHED, + DEVLINK_PORT_FN_OPSTATE_ATTACHED, +}; + #endif /* _LINUX_DEVLINK_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b4cf6679..c96880c5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -523,6 +523,8 @@ enum { IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 709668e2..afe6836e 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -592,6 +592,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ }; enum { diff --git a/include/uapi/linux/rpl.h b/include/uapi/linux/rpl.h index c24b64cd..72d60e09 100644 --- a/include/uapi/linux/rpl.h +++ b/include/uapi/linux/rpl.h @@ -28,10 +28,10 @@ struct ipv6_rpl_sr_hdr { pad:4, reserved1:16; #elif defined(__BIG_ENDIAN_BITFIELD) - __u32 reserved:20, + __u32 cmpri:4, + cmpre:4, pad:4, - cmpri:4, - cmpre:4; + reserved:20; #else #error "Please fix " #endif From a9642c5fa6e6b3bb871aea53947ec21679c7f3ea Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 1 Feb 2021 23:35:47 +0200 Subject: [PATCH 16/32] devlink: Introduce and use string to number mapper Instead of using static mapping in code, introduce a helper routine to map a value to string. 
Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- devlink/devlink.c | 30 ++++++++++++++---------------- include/utils.h | 8 ++++++++ lib/utils.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index a2e06644..d21a7c4d 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1383,6 +1383,16 @@ static int reload_limit_get(struct dl *dl, const char *limitstr, return 0; } +static struct str_num_map port_flavour_map[] = { + { .str = "physical", .num = DEVLINK_PORT_FLAVOUR_PHYSICAL }, + { .str = "cpu", .num = DEVLINK_PORT_FLAVOUR_CPU }, + { .str = "dsa", .num = DEVLINK_PORT_FLAVOUR_DSA }, + { .str = "pcipf", .num = DEVLINK_PORT_FLAVOUR_PCI_PF }, + { .str = "pcivf", .num = DEVLINK_PORT_FLAVOUR_PCI_VF }, + { .str = "virtual", .num = DEVLINK_PORT_FLAVOUR_VIRTUAL}, + { .str = NULL, }, +}; + struct dl_args_metadata { uint64_t o_flag; char err_msg[DL_ARGS_REQUIRED_MAX_ERR_LEN]; @@ -3717,22 +3727,10 @@ static const char *port_type_name(uint32_t type) static const char *port_flavour_name(uint16_t flavour) { - switch (flavour) { - case DEVLINK_PORT_FLAVOUR_PHYSICAL: - return "physical"; - case DEVLINK_PORT_FLAVOUR_CPU: - return "cpu"; - case DEVLINK_PORT_FLAVOUR_DSA: - return "dsa"; - case DEVLINK_PORT_FLAVOUR_PCI_PF: - return "pcipf"; - case DEVLINK_PORT_FLAVOUR_PCI_VF: - return "pcivf"; - case DEVLINK_PORT_FLAVOUR_VIRTUAL: - return "virtual"; - default: - return ""; - } + const char *str; + + str = str_map_lookup_u16(port_flavour_map, flavour); + return str ? 
str : ""; } static void pr_out_port_pfvf_num(struct dl *dl, struct nlattr **tb) diff --git a/include/utils.h b/include/utils.h index f1403f73..1d67443e 100644 --- a/include/utils.h +++ b/include/utils.h @@ -340,4 +340,12 @@ int parse_mapping(int *argcp, char ***argvp, bool allow_all, int (*mapping_cb)(__u32 key, char *value, void *data), void *mapping_cb_data); +struct str_num_map { + const char *str; + int num; +}; + +int str_map_lookup_str(const struct str_num_map *map, const char *needle); +const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val); + #endif /* __UTILS_H__ */ diff --git a/lib/utils.c b/lib/utils.c index 90e58fa3..9fef2d76 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1937,3 +1937,31 @@ int parse_mapping(int *argcp, char ***argvp, bool allow_all, return parse_mapping_gen(argcp, argvp, parse_mapping_num, mapping_cb, mapping_cb_data); } + +int str_map_lookup_str(const struct str_num_map *map, const char *needle) +{ + if (!needle) + return -EINVAL; + + /* Process array which is NULL terminated by the string. */ + while (map && map->str) { + if (strcmp(map->str, needle) == 0) + return map->num; + + map++; + } + return -EINVAL; +} + +const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val) +{ + int num = val; + + while (map && map->str) { + if (num == map->num) + return map->str; + + map++; + } + return NULL; +} From 836a1365b742a9a46f662bd942807de33e68e880 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 1 Feb 2021 23:35:48 +0200 Subject: [PATCH 17/32] devlink: Introduce PCI SF port flavour and attribute Introduce PCI SF port flavour and port attributes such as PF number and SF number. 
$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index d21a7c4d..338cb035 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1389,6 +1389,7 @@ static struct str_num_map port_flavour_map[] = { { .str = "dsa", .num = DEVLINK_PORT_FLAVOUR_DSA }, { .str = "pcipf", .num = DEVLINK_PORT_FLAVOUR_PCI_PF }, { .str = "pcivf", .num = DEVLINK_PORT_FLAVOUR_PCI_VF }, + { .str = "pcisf", .num = DEVLINK_PORT_FLAVOUR_PCI_SF }, { .str = "virtual", .num = DEVLINK_PORT_FLAVOUR_VIRTUAL}, { .str = NULL, }, }; @@ -3733,7 +3734,7 @@ static const char *port_flavour_name(uint16_t flavour) return str ? 
str : ""; } -static void pr_out_port_pfvf_num(struct dl *dl, struct nlattr **tb) +static void pr_out_port_pfvfsf_num(struct dl *dl, struct nlattr **tb) { uint16_t fn_num; @@ -3748,6 +3749,10 @@ static void pr_out_port_pfvf_num(struct dl *dl, struct nlattr **tb) fn_num = mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_PCI_VF_NUMBER]); print_uint(PRINT_ANY, "vfnum", " vfnum %u", fn_num); } + if (tb[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { + fn_num = mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); + print_uint(PRINT_ANY, "sfnum", " sfnum %u", fn_num); + } if (tb[DEVLINK_ATTR_PORT_EXTERNAL]) { uint8_t external; @@ -3825,7 +3830,8 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb) switch (port_flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: case DEVLINK_PORT_FLAVOUR_PCI_VF: - pr_out_port_pfvf_num(dl, tb); + case DEVLINK_PORT_FLAVOUR_PCI_SF: + pr_out_port_pfvfsf_num(dl, tb); break; default: break; From 331bf89ad08421750ad80688192b9831b02c6361 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 1 Feb 2021 23:35:49 +0200 Subject: [PATCH 18/32] devlink: Supporting add and delete of devlink port Enable user to add and delete the devlink port. 
Examples of add, show and delete commands for one SF port: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false Add devlink port of flavour 'pcisf' for PF number 0 SF number 88: $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached Delete newly added devlink port $ devlink port del pci/0000:06:00.0/32768 Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 108 ++++++++++++++++++++++++++++++++++++++++ man/man8/devlink-port.8 | 63 +++++++++++++++++++++++ 2 files changed, 171 insertions(+) diff --git a/devlink/devlink.c b/devlink/devlink.c index 338cb035..76ea7cac 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -306,6 +306,9 @@ static void ifname_map_free(struct ifname_map *ifname_map) #define DL_OPT_FLASH_OVERWRITE BIT(39) #define DL_OPT_RELOAD_ACTION BIT(40) #define DL_OPT_RELOAD_LIMIT BIT(41) +#define DL_OPT_PORT_FLAVOUR BIT(42) +#define DL_OPT_PORT_PFNUMBER BIT(43) +#define DL_OPT_PORT_SFNUMBER BIT(44) struct dl_opts { uint64_t present; /* flags of present items */ @@ -356,6 +359,9 @@ struct dl_opts { uint32_t overwrite_mask; enum devlink_reload_action reload_action; enum devlink_reload_limit reload_limit; + uint32_t port_sfnumber; + uint16_t port_flavour; + uint16_t port_pfnumber; }; struct dl { @@ -1394,6 +1400,17 @@ static struct str_num_map port_flavour_map[] = { { .str = NULL, }, }; +static int port_flavour_parse(const char *flavour, uint16_t *value) +{ + int num; + + num = str_map_lookup_str(port_flavour_map, flavour); + if (num < 0) + return num; + *value = num; + return 0; +} + struct dl_args_metadata { uint64_t o_flag; char err_msg[DL_ARGS_REQUIRED_MAX_ERR_LEN]; @@
-1425,6 +1442,8 @@ static const struct dl_args_metadata dl_args_required[] = { {DL_OPT_TRAP_NAME, "Trap's name is expected."}, {DL_OPT_TRAP_GROUP_NAME, "Trap group's name is expected."}, {DL_OPT_PORT_FUNCTION_HW_ADDR, "Port function's hardware address is expected."}, + {DL_OPT_PORT_FLAVOUR, "Port flavour is expected."}, + {DL_OPT_PORT_PFNUMBER, "Port PCI PF number is expected."}, }; static int dl_args_finding_required_validate(uint64_t o_required, @@ -1843,7 +1862,29 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required, if (err) return err; o_found |= DL_OPT_PORT_FUNCTION_HW_ADDR; + } else if (dl_argv_match(dl, "flavour") && (o_all & DL_OPT_PORT_FLAVOUR)) { + const char *flavourstr; + dl_arg_inc(dl); + err = dl_argv_str(dl, &flavourstr); + if (err) + return err; + err = port_flavour_parse(flavourstr, &opts->port_flavour); + if (err) + return err; + o_found |= DL_OPT_PORT_FLAVOUR; + } else if (dl_argv_match(dl, "pfnum") && (o_all & DL_OPT_PORT_PFNUMBER)) { + dl_arg_inc(dl); + err = dl_argv_uint16_t(dl, &opts->port_pfnumber); + if (err) + return err; + o_found |= DL_OPT_PORT_PFNUMBER; + } else if (dl_argv_match(dl, "sfnum") && (o_all & DL_OPT_PORT_SFNUMBER)) { + dl_arg_inc(dl); + err = dl_argv_uint32_t(dl, &opts->port_sfnumber); + if (err) + return err; + o_found |= DL_OPT_PORT_SFNUMBER; } else { pr_err("Unknown option \"%s\"\n", dl_argv(dl)); return -EINVAL; @@ -2026,6 +2067,12 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) opts->trap_policer_burst); if (opts->present & DL_OPT_PORT_FUNCTION_HW_ADDR) dl_function_attr_put(nlh, opts); + if (opts->present & DL_OPT_PORT_FLAVOUR) + mnl_attr_put_u16(nlh, DEVLINK_ATTR_PORT_FLAVOUR, opts->port_flavour); + if (opts->present & DL_OPT_PORT_PFNUMBER) + mnl_attr_put_u16(nlh, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, opts->port_pfnumber); + if (opts->present & DL_OPT_PORT_SFNUMBER) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, opts->port_sfnumber); } static int dl_argv_parse_put(struct nlmsghdr *nlh, 
struct dl *dl, @@ -3713,6 +3760,8 @@ static void cmd_port_help(void) pr_err(" devlink port unsplit DEV/PORT_INDEX\n"); pr_err(" devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ]\n"); pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); + pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); + pr_err(" devlink port del DEV/PORT_INDEX\n"); } static const char *port_type_name(uint32_t type) @@ -3974,6 +4023,58 @@ static int cmd_port_function(struct dl *dl) static int cmd_health(struct dl *dl); static int __cmd_health_show(struct dl *dl, bool show_device, bool show_port); +static void cmd_port_add_help(void) +{ + pr_err(" devlink port add { DEV | DEV/PORT_INDEX } flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); +} + +static int cmd_port_add(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; + + if (dl_argv_match(dl, "help") || dl_no_arg(dl)) { + cmd_port_add_help(); + return 0; + } + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_NEW, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE | DL_OPT_HANDLEP | + DL_OPT_PORT_FLAVOUR | DL_OPT_PORT_PFNUMBER, + DL_OPT_PORT_SFNUMBER); + if (err) + return err; + + return _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_port_show_cb, dl); +} + +static void cmd_port_del_help(void) +{ + pr_err(" devlink port del DEV/PORT_INDEX\n"); +} + +static int cmd_port_del(struct dl *dl) +{ + struct nlmsghdr *nlh; + int err; + + if (dl_argv_match(dl, "help") || dl_no_arg(dl)) { + cmd_port_del_help(); + return 0; + } + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_DEL, + NLM_F_REQUEST | NLM_F_ACK); + + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLEP, 0); + if (err) + return err; + + return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); +} + static int cmd_port(struct dl *dl) { if (dl_argv_match(dl, "help")) { @@ -4004,7 +4105,14 @@ static int cmd_port(struct dl *dl) } else { return cmd_health(dl); } + } else if (dl_argv_match(dl, "add")) { + 
dl_arg_inc(dl); + return cmd_port_add(dl); + } else if (dl_argv_match(dl, "del")) { + dl_arg_inc(dl); + return cmd_port_del(dl); } + pr_err("Command \"%s\" not found\n", dl_argv(dl)); return -ENOENT; } diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 index 966faae6..4a1d3800 100644 --- a/man/man8/devlink-port.8 +++ b/man/man8/devlink-port.8 @@ -43,6 +43,23 @@ devlink-port \- devlink port configuration .B devlink port health .RI "{ " show " | " recover " | " diagnose " | " dump " | " set " }" +.ti -8 +.BI "devlink port add" +.RB "[" +.IR "DEV | DEV/PORT_INDEX" +.RB "] " +.RB "[ " flavour +.IR FLAVOUR " ]" +.RB "[ " pcipf +.IR PFNUMBER " ]" +.RB "{ " pcisf +.IR SFNUMBER " }" +.br + +.ti -8 +.B devlink port del +.IR DEV/PORT_INDEX + .ti -8 .B devlink port help @@ -99,6 +116,42 @@ If this argument is omitted all ports are listed. Is an alias for .BR devlink-health (8). +.ti -8 +.SS devlink port add - add a devlink port +.PP +.B "DEV" +- specifies the devlink device to operate on. or + +.PP +.B "DEV/PORT_INDEX" +- specifies the devlink port index to use for the requested new port. +This is optional. When omitted, the driver allocates a unique port index. + +.TP +.BR flavour " { " pcipf " | " pcisf " } " +set port flavour + +.I pcipf +- PCI PF port + +.I pcisf +- PCI SF port + +.TP +.BR pfnum " { " pfnumber " } " +Specifies PCI pfnumber to use on which a SF device to create + +.TP +.BR sfnum " { " sfnumber " } " +Specifies sfnumber to assign to the device of the SF. +This field is optional for those devices which supports auto assignment of the SF number. + +.ti -8 +.SS devlink port del - delete a devlink port +.PP +.B "DEV/PORT_INDEX" +- specifies the devlink port to delete. + .SH "EXAMPLES" .PP devlink port show @@ -135,6 +188,16 @@ devlink port health show pci/0000:01:00.0/1 reporter tx .RS 4 Shows status and configuration of tx reporter registered on pci/0000:01:00.0/1 devlink port. 
.RE +.PP +devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 +.RS 4 +Add a devlink port of flavour PCI SF on PCI PF having number 0 with SF number 88. +.RE +.PP +devlink port del pci/0000:06:00.0/1 +.RS 4 +Delete previously created devlink port. +.RE .SH SEE ALSO .BR devlink (8), From 249465d3bf4eaa6186ff940160d355730a086e97 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 1 Feb 2021 23:35:50 +0200 Subject: [PATCH 19/32] devlink: Support get port function state Print port function state and operational state whenever reported by kernel. Example of a PCI SF port function which supports the state: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "inactive", "opstate": "detached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 61 ++++++++++++++++++++++++++++++++++++++++------- include/utils.h | 1 + lib/utils.c | 13 ++++++++++ 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 76ea7cac..17db8623 100644 --- 
a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1400,6 +1400,18 @@ static struct str_num_map port_flavour_map[] = { { .str = NULL, }, }; +static struct str_num_map port_fn_state_map[] = { + { .str = "inactive", .num = DEVLINK_PORT_FN_STATE_INACTIVE}, + { .str = "active", .num = DEVLINK_PORT_FN_STATE_ACTIVE }, + { .str = NULL, } +}; + +static struct str_num_map port_fn_opstate_map[] = { + { .str = "attached", .num = DEVLINK_PORT_FN_OPSTATE_ATTACHED}, + { .str = "detached", .num = DEVLINK_PORT_FN_OPSTATE_DETACHED}, + { .str = NULL, } +}; + static int port_flavour_parse(const char *flavour, uint16_t *value) { int num; @@ -3810,6 +3822,22 @@ static void pr_out_port_pfvfsf_num(struct dl *dl, struct nlattr **tb) } } +static const char *port_fn_state(uint8_t state) +{ + const char *str; + + str = str_map_lookup_u8(port_fn_state_map, state); + return str ? str : ""; +} + +static const char *port_fn_opstate(uint8_t state) +{ + const char *str; + + str = str_map_lookup_u8(port_fn_opstate_map, state); + return str ? 
str : ""; +} + static void pr_out_port_function(struct dl *dl, struct nlattr **tb_port) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {}; @@ -3826,16 +3854,33 @@ static void pr_out_port_function(struct dl *dl, struct nlattr **tb_port) if (err != MNL_CB_OK) return; - if (!tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]) - return; - - len = mnl_attr_get_payload_len(tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]); - data = mnl_attr_get_payload(tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]); - pr_out_object_start(dl, "function"); check_indent_newline(dl); - print_string(PRINT_ANY, "hw_addr", "hw_addr %s", - ll_addr_n2a(data, len, 0, hw_addr, sizeof(hw_addr))); + + if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]) { + len = mnl_attr_get_payload_len(tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]); + data = mnl_attr_get_payload(tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]); + + print_string(PRINT_ANY, "hw_addr", "hw_addr %s", + ll_addr_n2a(data, len, 0, hw_addr, sizeof(hw_addr))); + } + if (tb[DEVLINK_PORT_FN_ATTR_STATE]) { + uint8_t state; + + state = mnl_attr_get_u8(tb[DEVLINK_PORT_FN_ATTR_STATE]); + + print_string(PRINT_ANY, "state", " state %s", + port_fn_state(state)); + } + if (tb[DEVLINK_PORT_FN_ATTR_OPSTATE]) { + uint8_t state; + + state = mnl_attr_get_u8(tb[DEVLINK_PORT_FN_ATTR_OPSTATE]); + + print_string(PRINT_ANY, "opstate", " opstate %s", + port_fn_opstate(state)); + } + if (!dl->json_output) __pr_out_indent_dec(); pr_out_object_end(dl); diff --git a/include/utils.h b/include/utils.h index 1d67443e..e66090ae 100644 --- a/include/utils.h +++ b/include/utils.h @@ -347,5 +347,6 @@ struct str_num_map { int str_map_lookup_str(const struct str_num_map *map, const char *needle); const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val); +const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val); #endif /* __UTILS_H__ */ diff --git a/lib/utils.c b/lib/utils.c index 9fef2d76..af1b553c 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1965,3 +1965,16 @@ const char 
*str_map_lookup_u16(const struct str_num_map *map, uint16_t val) } return NULL; } + +const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val) +{ + int num = val; + + while (map && map->str) { + if (num == map->num) + return map->str; + + map++; + } + return NULL; +} From bdfb9f1bd61a6aa92e2d832a13452182cde07155 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 1 Feb 2021 23:35:51 +0200 Subject: [PATCH 20/32] devlink: Support set of port function state Support set operation of the devlink port function state. Example of a PCI SF port function which supports the state: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 51 ++++++++++++++++++++++++++----- man/man8/devlink-port.8 | 68 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 9 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 17db8623..10398f77 100644 --- a/devlink/devlink.c 
+++ b/devlink/devlink.c @@ -309,6 +309,7 @@ static void ifname_map_free(struct ifname_map *ifname_map) #define DL_OPT_PORT_FLAVOUR BIT(42) #define DL_OPT_PORT_PFNUMBER BIT(43) #define DL_OPT_PORT_SFNUMBER BIT(44) +#define DL_OPT_PORT_FUNCTION_STATE BIT(45) struct dl_opts { uint64_t present; /* flags of present items */ @@ -362,6 +363,7 @@ struct dl_opts { uint32_t port_sfnumber; uint16_t port_flavour; uint16_t port_pfnumber; + uint8_t port_fn_state; }; struct dl { @@ -747,6 +749,7 @@ static int attr_stats_cb(const struct nlattr *attr, void *data) static const enum mnl_attr_data_type devlink_function_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR ] = MNL_TYPE_BINARY, + [DEVLINK_PORT_FN_ATTR_STATE] = MNL_TYPE_U8, }; static int function_attr_cb(const struct nlattr *attr, void *data) @@ -1423,6 +1426,17 @@ static int port_flavour_parse(const char *flavour, uint16_t *value) return 0; } +static int port_fn_state_parse(const char *statestr, uint8_t *state) +{ + int num; + + num = str_map_lookup_str(port_fn_state_map, statestr); + if (num < 0) + return num; + *state = num; + return 0; +} + struct dl_args_metadata { uint64_t o_flag; char err_msg[DL_ARGS_REQUIRED_MAX_ERR_LEN]; @@ -1874,6 +1888,19 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required, if (err) return err; o_found |= DL_OPT_PORT_FUNCTION_HW_ADDR; + } else if (dl_argv_match(dl, "state") && + (o_all & DL_OPT_PORT_FUNCTION_STATE)) { + const char *statestr; + + dl_arg_inc(dl); + err = dl_argv_str(dl, &statestr); + if (err) + return err; + err = port_fn_state_parse(statestr, &opts->port_fn_state); + if (err) + return err; + + o_found |= DL_OPT_PORT_FUNCTION_STATE; } else if (dl_argv_match(dl, "flavour") && (o_all & DL_OPT_PORT_FLAVOUR)) { const char *flavourstr; @@ -1919,9 +1946,14 @@ dl_function_attr_put(struct nlmsghdr *nlh, const struct dl_opts *opts) struct nlattr *nest; nest = mnl_attr_nest_start(nlh, DEVLINK_ATTR_PORT_FUNCTION); - mnl_attr_put(nlh, 
DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, - opts->port_function_hw_addr_len, - opts->port_function_hw_addr); + + if (opts->present & DL_OPT_PORT_FUNCTION_HW_ADDR) + mnl_attr_put(nlh, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, + opts->port_function_hw_addr_len, + opts->port_function_hw_addr); + if (opts->present & DL_OPT_PORT_FUNCTION_STATE) + mnl_attr_put_u8(nlh, DEVLINK_PORT_FN_ATTR_STATE, + opts->port_fn_state); mnl_attr_nest_end(nlh, nest); } @@ -2077,7 +2109,7 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl) if (opts->present & DL_OPT_TRAP_POLICER_BURST) mnl_attr_put_u64(nlh, DEVLINK_ATTR_TRAP_POLICER_BURST, opts->trap_policer_burst); - if (opts->present & DL_OPT_PORT_FUNCTION_HW_ADDR) + if (opts->present & (DL_OPT_PORT_FUNCTION_HW_ADDR | DL_OPT_PORT_FUNCTION_STATE)) dl_function_attr_put(nlh, opts); if (opts->present & DL_OPT_PORT_FLAVOUR) mnl_attr_put_u16(nlh, DEVLINK_ATTR_PORT_FLAVOUR, opts->port_flavour); @@ -3770,7 +3802,7 @@ static void cmd_port_help(void) pr_err(" devlink port set DEV/PORT_INDEX [ type { eth | ib | auto} ]\n"); pr_err(" devlink port split DEV/PORT_INDEX count COUNT\n"); pr_err(" devlink port unsplit DEV/PORT_INDEX\n"); - pr_err(" devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ]\n"); + pr_err(" devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ] [ state STATE ]\n"); pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); pr_err(" devlink port del DEV/PORT_INDEX\n"); @@ -4035,7 +4067,7 @@ static int cmd_port_unsplit(struct dl *dl) static void cmd_port_function_help(void) { - pr_err("Usage: devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ]\n"); + pr_err("Usage: devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ] [ state STATE ]\n"); } static int cmd_port_function_set(struct dl *dl) @@ -4043,9 +4075,14 @@ static int cmd_port_function_set(struct dl *dl) struct nlmsghdr *nlh; int err; + if 
(dl_no_arg(dl)) { + cmd_port_function_help(); + return 0; + } nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_SET, NLM_F_REQUEST | NLM_F_ACK); - err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLEP | DL_OPT_PORT_FUNCTION_HW_ADDR, 0); + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLEP, + DL_OPT_PORT_FUNCTION_HW_ADDR | DL_OPT_PORT_FUNCTION_STATE); if (err) return err; diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 index 4a1d3800..55f1cce6 100644 --- a/man/man8/devlink-port.8 +++ b/man/man8/devlink-port.8 @@ -60,6 +60,16 @@ devlink-port \- devlink port configuration .B devlink port del .IR DEV/PORT_INDEX +.ti -8 +.BR "devlink port function set " +.IR DEV/PORT_INDEX +.RI "{ " +.BR "hw_addr " +.RI "ADDR }" +.RI "{ " +.BR "state" +.RI "STATE }" + .ti -8 .B devlink port help @@ -144,7 +154,30 @@ Specifies PCI pfnumber to use on which a SF device to create .TP .BR sfnum " { " sfnumber " } " Specifies sfnumber to assign to the device of the SF. -This field is optional for those devices which supports auto assignment of the SF number. +This field is optional for those devices which supports auto assignment of the +SF number. + +.ti -8 +.SS devlink port function set - Set the port function attribute(s). + +.PP +.B "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.TP +.BR hw_addr " ADDR" +- hardware address of the function to set. This is a Ethernet MAC address when +port type is Ethernet. + +.TP +.BR state " { " active " | " inactive " } " +- new state of the function to change to. + +.I active +- Once configuration of the function is done, activate the function. + +.I inactive +- To inactivate the function and its device(s), set to inactive. .ti -8 .SS devlink port del - delete a devlink port @@ -192,11 +225,42 @@ Shows status and configuration of tx reporter registered on pci/0000:01:00.0/1 d devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 .RS 4 Add a devlink port of flavour PCI SF on PCI PF having number 0 with SF number 88. 
+To make use of the function an example sequence is to add a port, configure the +function attribute and activate the function. Once function usage is completed, +inactivate the function and finally delete the port. When there is a desire to +reuse the port without deletion, it can be reconfigured and activated again when +function is in inactive state and function's operational state is detached. .RE .PP devlink port del pci/0000:06:00.0/1 .RS 4 -Delete previously created devlink port. +Delete previously created devlink port. It is recommended to first deactivate +the function if the function supports state management. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 hw_addr 00:00:00:11:22:33 +.RS 4 +Configure hardware address of the PCI function represented by devlink port. +If the port supports change in function state, hardware address must be configured +before activating the function. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 state active +.RS 4 +Activate the function. This will initiate the function enumeration and driver loading. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 state inactive +.RS 4 +Deactivate the function. This will initiate the function teardown which results +in driver unload and device removal. +.RE +.PP +devlink port function set pci/0000:01:00.0/1 hw_addr 00:00:00:11:22:33 state active +.RS 4 +Configure hardware address and also activate the function. When a function is +activated together with other configuration in a single command, all the +configuration is applied first before changing the state to active. .RE .SH SEE ALSO From b7e5002456df3dd68830369b6bb9386e0a171d55 Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Mon, 1 Feb 2021 20:32:10 -0700 Subject: [PATCH 21/32] ss: always prefer family as part of host condition to default family ss accepts an address family both with the -f option and as part of a host condition. 
However, if the family in the host condition is different than the last -f option, then which family is actually used depends on the order that different families are checked. This changes parse_hostcond to check all family prefixes before parsing the rest of the address, so that the host condition's family always has a higher priority than the "preferred" family. Signed-off-by: Thayne McCombs Signed-off-by: David Ahern --- misc/ss.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index 0593627b..aefa1c2f 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -2119,24 +2119,39 @@ void *parse_hostcond(char *addr, bool is_port) int fam = preferred_family; struct filter *f = &current_filter; - if (fam == AF_UNIX || strncmp(addr, "unix:", 5) == 0) { + if (strncmp(addr, "unix:", 5) == 0) { + fam = AF_UNIX; + addr += 5; + } else if (strncmp(addr, "link:", 5) == 0) { + fam = AF_PACKET; + addr += 5; + } else if (strncmp(addr, "netlink:", 8) == 0) { + fam = AF_NETLINK; + addr += 8; + } else if (strncmp(addr, "vsock:", 6) == 0) { + fam = AF_VSOCK; + addr += 6; + } else if (strncmp(addr, "inet:", 5) == 0) { + fam = AF_INET; + addr += 5; + } else if (strncmp(addr, "inet6:", 6) == 0) { + fam = AF_INET6; + addr += 6; + } + + if (fam == AF_UNIX) { char *p; a.addr.family = AF_UNIX; - if (strncmp(addr, "unix:", 5) == 0) - addr += 5; p = strdup(addr); a.addr.bitlen = 8*strlen(p); memcpy(a.addr.data, &p, sizeof(p)); - fam = AF_UNIX; goto out; } - if (fam == AF_PACKET || strncmp(addr, "link:", 5) == 0) { + if (fam == AF_PACKET) { a.addr.family = AF_PACKET; a.addr.bitlen = 0; - if (strncmp(addr, "link:", 5) == 0) - addr += 5; port = strchr(addr, ':'); if (port) { *port = 0; @@ -2155,15 +2170,12 @@ void *parse_hostcond(char *addr, bool is_port) return NULL; a.addr.data[0] = ntohs(tmp); } - fam = AF_PACKET; goto out; } - if (fam == AF_NETLINK || strncmp(addr, "netlink:", 8) == 0) { + if (fam == AF_NETLINK) { 
a.addr.family = AF_NETLINK; a.addr.bitlen = 0; - if (strncmp(addr, "netlink:", 8) == 0) - addr += 8; port = strchr(addr, ':'); if (port) { *port = 0; @@ -2181,16 +2193,13 @@ void *parse_hostcond(char *addr, bool is_port) if (nl_proto_a2n(&a.addr.data[0], addr) == -1) return NULL; } - fam = AF_NETLINK; goto out; } - if (fam == AF_VSOCK || strncmp(addr, "vsock:", 6) == 0) { + if (fam == AF_VSOCK) { __u32 cid = ~(__u32)0; a.addr.family = AF_VSOCK; - if (strncmp(addr, "vsock:", 6) == 0) - addr += 6; if (is_port) port = addr; @@ -2212,20 +2221,9 @@ void *parse_hostcond(char *addr, bool is_port) return NULL; } vsock_set_inet_prefix(&a.addr, cid); - fam = AF_VSOCK; goto out; } - if (fam == AF_INET || !strncmp(addr, "inet:", 5)) { - fam = AF_INET; - if (!strncmp(addr, "inet:", 5)) - addr += 5; - } else if (fam == AF_INET6 || !strncmp(addr, "inet6:", 6)) { - fam = AF_INET6; - if (!strncmp(addr, "inet6:", 6)) - addr += 6; - } - /* URL-like literal [] */ if (addr[0] == '[') { addr++; From b8b8b6d4c986fdf9fbd39eca1a04909abfac209b Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 4 Feb 2021 16:51:37 +0200 Subject: [PATCH 22/32] tc/htb: Hierarchical QoS hardware offload This commit adds support for configuring HTB in offload mode. HTB offload eliminates the single qdisc lock in the datapath and offloads the algorithm to the NIC. The new 'offload' parameter is added to enable this mode: # tc qdisc replace dev eth0 root handle 1: htb offload Classes are created as usual, but filters should be moved to clsact for lock-free classification (filters attached to HTB itself are not supported in the offload mode): # tc filter add dev eth0 egress protocol ip flower dst_port 80 action skbedit priority 1:10 tc qdisc show and tc class show will indicate whether the offload is enabled. 
Example output: $ tc qdisc show dev eth1 qdisc htb 1: root offloaded r2q 10 default 0 direct_packets_stat 0 direct_qlen 1000 offload qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p qdisc pfifo 0: parent 1: limit 1000p $ tc class show dev eth1 class htb 1:101 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:1 root rate 100Gbit ceil 100Gbit burst 0b cburst 0b offload class htb 1:103 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:102 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:105 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:104 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:107 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:106 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload class htb 1:108 parent 1:1 prio 0 rate 4Gbit ceil 4Gbit burst 1000b cburst 1000b offload $ tc -j qdisc show dev eth1 [{"kind":"htb","handle":"1:","root":true,"offloaded":true,"options":{"r2q":10,"default":"0","direct_packets_stat":0,"direct_qlen":1000,"offload":null}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}},{"kind":"pfifo","handle":"0:","parent":"1:","options":{"limit":1000}}] Signed-off-by: Maxim 
Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: David Ahern --- man/man8/tc-htb.8 | 5 ++++- tc/q_htb.c | 10 +++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/man/man8/tc-htb.8 b/man/man8/tc-htb.8 index a4162342..031b73ac 100644 --- a/man/man8/tc-htb.8 +++ b/man/man8/tc-htb.8 @@ -12,7 +12,7 @@ major: minor-id .B ] [ r2q divisor -.B ] +.B ] [ offload ] .B tc class ... dev dev @@ -104,6 +104,9 @@ Divisor used to calculate values for classes. Classes divide .B rate by this number. Default value is 10. +.TP +offload +Offload the HTB algorithm to hardware (requires driver and device support). .SH CLASSES Classes have a host of parameters to configure their operation. diff --git a/tc/q_htb.c b/tc/q_htb.c index c609e974..42566355 100644 --- a/tc/q_htb.c +++ b/tc/q_htb.c @@ -30,11 +30,12 @@ static void explain(void) { fprintf(stderr, "Usage: ... qdisc add ... htb [default N] [r2q N]\n" - " [direct_qlen P]\n" + " [direct_qlen P] [offload]\n" " default minor id of class to which unclassified packets are sent {0}\n" " r2q DRR quantums are computed as rate in Bps/r2q {10}\n" " debug string of 16 numbers each 0-3 {0}\n\n" " direct_qlen Limit of the direct queue {in packets}\n" + " offload enable hardware offload\n" "... class add ... 
htb rate R1 [burst B1] [mpu B] [overhead O]\n" " [prio P] [slot S] [pslot PS]\n" " [ceil R2] [cburst B2] [mtu MTU] [quantum Q]\n" @@ -68,6 +69,7 @@ static int htb_parse_opt(struct qdisc_util *qu, int argc, }; struct rtattr *tail; unsigned int i; char *p; + bool offload = false; while (argc > 0) { if (matches(*argv, "r2q") == 0) { @@ -91,6 +93,8 @@ static int htb_parse_opt(struct qdisc_util *qu, int argc, if (get_u32(&direct_qlen, *argv, 10)) { explain1("direct_qlen"); return -1; } + } else if (matches(*argv, "offload") == 0) { + offload = true; } else { fprintf(stderr, "What is \"%s\"?\n", *argv); explain(); @@ -103,6 +107,8 @@ static int htb_parse_opt(struct qdisc_util *qu, int argc, if (direct_qlen != ~0U) addattr_l(n, 2024, TCA_HTB_DIRECT_QLEN, &direct_qlen, sizeof(direct_qlen)); + if (offload) + addattr(n, 2024, TCA_HTB_OFFLOAD); addattr_nest_end(n, tail); return 0; } @@ -344,6 +350,8 @@ static int htb_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) print_uint(PRINT_ANY, "direct_qlen", " direct_qlen %u", direct_qlen); } + if (tb[TCA_HTB_OFFLOAD]) + print_null(PRINT_ANY, "offload", " offload", NULL); return 0; } From 049708a002960e89f13002d06b3c378ae7ecacb3 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 2 Feb 2021 14:24:42 +0200 Subject: [PATCH 23/32] tc: flower: Add support for ct_state reply flag Matches on conntrack rpl ct_state. Example: $ tc filter add dev ens1f0_0 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est+rpl \ action mirred egress redirect dev ens1f0_1 $ tc filter add dev ens1f0_1 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est-rpl \ action mirred egress redirect dev ens1f0_0 Signed-off-by: Paul Blakey Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 2 ++ tc/f_flower.c | 1 + 2 files changed, 3 insertions(+) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index 226d1cc6..f7336b62 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -387,6 +387,8 @@ new - New connection. 
.TP est - Established connection. .TP +rpl - The packet is in the reply direction, meaning that it is in the opposite direction from the packet that initiated the connection. +.TP inv - The state is invalid. The packet couldn't be associated to a connection. .TP Example: +trk+est diff --git a/tc/f_flower.c b/tc/f_flower.c index 85c1043a..53822a95 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -346,6 +346,7 @@ static struct flower_ct_states { { "new", TCA_FLOWER_KEY_CT_FLAGS_NEW }, { "est", TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED }, { "inv", TCA_FLOWER_KEY_CT_FLAGS_INVALID }, + { "rpl", TCA_FLOWER_KEY_CT_FLAGS_REPLY }, }; static int flower_parse_ct_state(char *str, struct nlmsghdr *n) From 5a6bf92a95f58d025037aeb0dab4a4a654cda430 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Feb 2021 20:34:41 +0200 Subject: [PATCH 24/32] Add kernel headers Add kernel headers to commit from kernel tree [1]. 6acba4951632 ("vdpa_sim_net: Add support for user supported devices") [1] https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- vdpa/include/uapi/linux/vdpa.h | 40 +++++++++++++++++++ vdpa/include/uapi/linux/virtio_ids.h | 58 ++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 vdpa/include/uapi/linux/vdpa.h create mode 100644 vdpa/include/uapi/linux/virtio_ids.h diff --git a/vdpa/include/uapi/linux/vdpa.h b/vdpa/include/uapi/linux/vdpa.h new file mode 100644 index 00000000..37ae26b6 --- /dev/null +++ b/vdpa/include/uapi/linux/vdpa.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * vdpa device management interface + * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved. 
+ */ + +#ifndef _LINUX_VDPA_H_ +#define _LINUX_VDPA_H_ + +#define VDPA_GENL_NAME "vdpa" +#define VDPA_GENL_VERSION 0x1 + +enum vdpa_command { + VDPA_CMD_UNSPEC, + VDPA_CMD_MGMTDEV_NEW, + VDPA_CMD_MGMTDEV_GET, /* can dump */ + VDPA_CMD_DEV_NEW, + VDPA_CMD_DEV_DEL, + VDPA_CMD_DEV_GET, /* can dump */ +}; + +enum vdpa_attr { + VDPA_ATTR_UNSPEC, + + /* bus name (optional) + dev name together make the parent device handle */ + VDPA_ATTR_MGMTDEV_BUS_NAME, /* string */ + VDPA_ATTR_MGMTDEV_DEV_NAME, /* string */ + VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, /* u64 */ + + VDPA_ATTR_DEV_NAME, /* string */ + VDPA_ATTR_DEV_ID, /* u32 */ + VDPA_ATTR_DEV_VENDOR_ID, /* u32 */ + VDPA_ATTR_DEV_MAX_VQS, /* u32 */ + VDPA_ATTR_DEV_MAX_VQ_SIZE, /* u16 */ + + /* new attributes must be added above here */ + VDPA_ATTR_MAX, +}; + +#endif diff --git a/vdpa/include/uapi/linux/virtio_ids.h b/vdpa/include/uapi/linux/virtio_ids.h new file mode 100644 index 00000000..bc1c0621 --- /dev/null +++ b/vdpa/include/uapi/linux/virtio_ids.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_VIRTIO_IDS_H +#define _LINUX_VIRTIO_IDS_H +/* + * Virtio IDs + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ + +#define VIRTIO_ID_NET 1 /* virtio net */ +#define VIRTIO_ID_BLOCK 2 /* virtio block */ +#define VIRTIO_ID_CONSOLE 3 /* virtio console */ +#define VIRTIO_ID_RNG 4 /* virtio rng */ +#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ +#define VIRTIO_ID_IOMEM 6 /* virtio ioMemory */ +#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ +#define VIRTIO_ID_SCSI 8 /* virtio scsi */ +#define VIRTIO_ID_9P 9 /* 9p virtio console */ +#define VIRTIO_ID_MAC80211_WLAN 10 /* virtio WLAN MAC */ +#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ +#define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_MEMORY_BALLOON 13 /* virtio memory balloon */ +#define VIRTIO_ID_GPU 16 /* virtio GPU */ +#define VIRTIO_ID_CLOCK 17 /* virtio clock/timer */ +#define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ +#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_SIGNAL_DIST 21 /* virtio signal distribution device */ +#define VIRTIO_ID_PSTORE 22 /* virtio pstore device */ +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_MEM 24 /* virtio mem */ +#define VIRTIO_ID_FS 26 /* virtio filesystem */ +#define 
VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ + +#endif /* _LINUX_VIRTIO_IDS_H */ From bd3709c3a7466a3cf00e04f0d0189cad77cf17a5 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Feb 2021 20:34:42 +0200 Subject: [PATCH 25/32] utils: Add helper routines for indent handling Subsequent patch needs to use 2 char indentation for nested objects. Hence introduce a generic helpers to allocate, deallocate, increment, decrement and to print indent block. Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- include/utils.h | 16 ++++++++++++ lib/utils.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/include/utils.h b/include/utils.h index e66090ae..9b76c92a 100644 --- a/include/utils.h +++ b/include/utils.h @@ -349,4 +349,20 @@ int str_map_lookup_str(const struct str_num_map *map, const char *needle); const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val); const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val); +unsigned int get_str_char_count(const char *str, int match); +int str_split_by_char(char *str, char **before, char **after, int match); + +#define INDENT_STR_MAXLEN 32 + +struct indent_mem { + int indent_level; + char indent_str[INDENT_STR_MAXLEN + 1]; +}; + +struct indent_mem *alloc_indent_mem(void); +void free_indent_mem(struct indent_mem *mem); +void inc_indent(struct indent_mem *mem); +void dec_indent(struct indent_mem *mem); +void print_indent(struct indent_mem *mem); + #endif /* __UTILS_H__ */ diff --git a/lib/utils.c b/lib/utils.c index af1b553c..cc6d0e34 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1978,3 +1978,69 @@ const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val) } return NULL; } + +unsigned int get_str_char_count(const char *str, int match) +{ + unsigned int count = 0; + const char *pos = str; + + while ((pos = strchr(pos, match))) { + count++; + pos++; + } + return 
count; +} + +int str_split_by_char(char *str, char **before, char **after, int match) +{ + char *slash; + + slash = strrchr(str, match); + if (!slash) + return -EINVAL; + *slash = '\0'; + *before = str; + *after = slash + 1; + return 0; +} + +struct indent_mem *alloc_indent_mem(void) +{ + struct indent_mem *mem = malloc(sizeof(*mem)); + + if (!mem) + return NULL; + strcpy(mem->indent_str, ""); + mem->indent_level = 0; + return mem; +} + +void free_indent_mem(struct indent_mem *mem) +{ + free(mem); +} + +#define INDENT_STR_STEP 2 + +void inc_indent(struct indent_mem *mem) +{ + if (mem->indent_level + INDENT_STR_STEP > INDENT_STR_MAXLEN) + return; + mem->indent_level += INDENT_STR_STEP; + memset(mem->indent_str, ' ', sizeof(mem->indent_str)); + mem->indent_str[mem->indent_level] = '\0'; +} + +void dec_indent(struct indent_mem *mem) +{ + if (mem->indent_level - INDENT_STR_STEP < 0) + return; + mem->indent_level -= INDENT_STR_STEP; + mem->indent_str[mem->indent_level] = '\0'; +} + +void print_indent(struct indent_mem *mem) +{ + if (mem->indent_level) + printf("%s", mem->indent_str); +} From b822275ad82b9a83dd962af9fb92c1b4299d0024 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Feb 2021 20:34:43 +0200 Subject: [PATCH 26/32] utils: Add generic socket helpers Subsequent patch needs to (a) query and use socket family (b) send/receive messages using this family Hence add helper routines to open, close, query family and to perform send receive operations. 
Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- include/mnl_utils.h | 16 ++++++ lib/mnl_utils.c | 121 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) diff --git a/include/mnl_utils.h b/include/mnl_utils.h index fa826ef1..9e7d6879 100644 --- a/include/mnl_utils.h +++ b/include/mnl_utils.h @@ -2,6 +2,22 @@ #ifndef __MNL_UTILS_H__ #define __MNL_UTILS_H__ 1 +struct mnlu_gen_socket { + struct mnl_socket *nl; + char *buf; + uint32_t family; + unsigned int seq; + uint8_t version; +}; + +int mnlu_gen_socket_open(struct mnlu_gen_socket *nlg, const char *family_name, + uint8_t version); +void mnlu_gen_socket_close(struct mnlu_gen_socket *nlg); +struct nlmsghdr *mnlu_gen_socket_cmd_prepare(struct mnlu_gen_socket *nlg, + uint8_t cmd, uint16_t flags); +int mnlu_gen_socket_sndrcv(struct mnlu_gen_socket *nlg, const struct nlmsghdr *nlh, + mnl_cb_t data_cb, void *data); + struct mnl_socket *mnlu_socket_open(int bus); struct nlmsghdr *mnlu_msg_prepare(void *buf, uint32_t nlmsg_type, uint16_t flags, void *extra_header, size_t extra_header_size); diff --git a/lib/mnl_utils.c b/lib/mnl_utils.c index 46384ff8..4f699455 100644 --- a/lib/mnl_utils.c +++ b/lib/mnl_utils.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "libnetlink.h" #include "mnl_utils.h" @@ -108,3 +109,123 @@ int mnlu_socket_recv_run(struct mnl_socket *nl, unsigned int seq, void *buf, siz return err; } + +static int get_family_id_attr_cb(const struct nlattr *attr, void *data) +{ + int type = mnl_attr_get_type(attr); + const struct nlattr **tb = data; + + if (mnl_attr_type_valid(attr, CTRL_ATTR_MAX) < 0) + return MNL_CB_ERROR; + + if (type == CTRL_ATTR_FAMILY_ID && + mnl_attr_validate(attr, MNL_TYPE_U16) < 0) + return MNL_CB_ERROR; + tb[type] = attr; + return MNL_CB_OK; +} + +static int get_family_id_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[CTRL_ATTR_MAX + 1] = {}; + uint32_t *p_id = 
data; + + mnl_attr_parse(nlh, sizeof(*genl), get_family_id_attr_cb, tb); + if (!tb[CTRL_ATTR_FAMILY_ID]) + return MNL_CB_ERROR; + *p_id = mnl_attr_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + return MNL_CB_OK; +} + +static int family_get(struct mnlu_gen_socket *nlg, const char *family_name) +{ + struct genlmsghdr hdr = {}; + struct nlmsghdr *nlh; + int err; + + hdr.cmd = CTRL_CMD_GETFAMILY; + hdr.version = 0x1; + + nlh = mnlu_msg_prepare(nlg->buf, GENL_ID_CTRL, + NLM_F_REQUEST | NLM_F_ACK, + &hdr, sizeof(hdr)); + + mnl_attr_put_strz(nlh, CTRL_ATTR_FAMILY_NAME, family_name); + + err = mnl_socket_sendto(nlg->nl, nlh, nlh->nlmsg_len); + if (err < 0) + return err; + + err = mnlu_socket_recv_run(nlg->nl, nlh->nlmsg_seq, nlg->buf, + MNL_SOCKET_BUFFER_SIZE, + get_family_id_cb, &nlg->family); + return err; +} + +int mnlu_gen_socket_open(struct mnlu_gen_socket *nlg, const char *family_name, + uint8_t version) +{ + int err; + + nlg->buf = malloc(MNL_SOCKET_BUFFER_SIZE); + if (!nlg->buf) + goto err_buf_alloc; + + nlg->nl = mnlu_socket_open(NETLINK_GENERIC); + if (!nlg->nl) + goto err_socket_open; + + err = family_get(nlg, family_name); + if (err) + goto err_socket; + + return 0; + +err_socket: + mnl_socket_close(nlg->nl); +err_socket_open: + free(nlg->buf); +err_buf_alloc: + return -1; +} + +void mnlu_gen_socket_close(struct mnlu_gen_socket *nlg) +{ + mnl_socket_close(nlg->nl); + free(nlg->buf); +} + +struct nlmsghdr *mnlu_gen_socket_cmd_prepare(struct mnlu_gen_socket *nlg, + uint8_t cmd, uint16_t flags) +{ + struct genlmsghdr hdr = {}; + struct nlmsghdr *nlh; + + hdr.cmd = cmd; + hdr.version = nlg->version; + nlh = mnlu_msg_prepare(nlg->buf, nlg->family, flags, &hdr, sizeof(hdr)); + nlg->seq = nlh->nlmsg_seq; + return nlh; +} + +int mnlu_gen_socket_sndrcv(struct mnlu_gen_socket *nlg, const struct nlmsghdr *nlh, + mnl_cb_t data_cb, void *data) +{ + int err; + + err = mnl_socket_sendto(nlg->nl, nlh, nlh->nlmsg_len); + if (err < 0) { + perror("Failed to send data"); + return -errno; + } 
+ + err = mnlu_socket_recv_run(nlg->nl, nlh->nlmsg_seq, nlg->buf, + MNL_SOCKET_BUFFER_SIZE, + data_cb, data); + if (err < 0) { + fprintf(stderr, "kernel answers: %s\n", strerror(errno)); + return -errno; + } + return 0; +} From 6c769949827584984e5b76402d70be84b25706a9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Feb 2021 20:34:44 +0200 Subject: [PATCH 27/32] utils: Add helper to map string to unsigned int In subsequent patch need to map a string to a unsigned int. Hence, add an API to map a string to unsigned int. Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- include/utils.h | 4 +++- lib/utils.c | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/utils.h b/include/utils.h index 9b76c92a..b29c3798 100644 --- a/include/utils.h +++ b/include/utils.h @@ -342,10 +342,12 @@ int parse_mapping(int *argcp, char ***argvp, bool allow_all, struct str_num_map { const char *str; - int num; + unsigned int num; }; int str_map_lookup_str(const struct str_num_map *map, const char *needle); +const char *str_map_lookup_uint(const struct str_num_map *map, + unsigned int val); const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val); const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val); diff --git a/lib/utils.c b/lib/utils.c index cc6d0e34..633f6359 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -1953,9 +1953,22 @@ int str_map_lookup_str(const struct str_num_map *map, const char *needle) return -EINVAL; } +const char *str_map_lookup_uint(const struct str_num_map *map, unsigned int val) +{ + unsigned int num = val; + + while (map && map->str) { + if (num == map->num) + return map->str; + + map++; + } + return NULL; +} + const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val) { - int num = val; + unsigned int num = val; while (map && map->str) { if (num == map->num) @@ -1968,7 +1981,7 @@ const char *str_map_lookup_u16(const struct str_num_map *map, uint16_t val) 
const char *str_map_lookup_u8(const struct str_num_map *map, uint8_t val) { - int num = val; + unsigned int num = val; while (map && map->str) { if (num == map->num) From c2ecc82b9d4cd3b71fa992afee05ac938eb1b66a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Feb 2021 20:34:45 +0200 Subject: [PATCH 28/32] vdpa: Add vdpa tool vdpa tool is created to create, delete and query vdpa devices. examples: Show vdpa management device that supports creating, deleting vdpa devices. $ vdpa mgmtdev show vdpasim: supported_classes net $ vdpa mgmtdev show -jp { "show": { "vdpasim": { "supported_classes": [ "net" ] } } } Create a vdpa device of type networking named as "foo2" from the management device vdpasim_net: $ vdpa dev add mgmtdev vdpasim_net name foo2 Show the newly created vdpa device by its name: $ vdpa dev show foo2 foo2: type network mgmtdev vdpasim_net vendor_id 0 max_vqs 2 max_vq_size 256 $ vdpa dev show foo2 -jp { "dev": { "foo2": { "type": "network", "mgmtdev": "vdpasim_net", "vendor_id": 0, "max_vqs": 2, "max_vq_size": 256 } } } Delete the vdpa device after its use: $ vdpa dev del foo2 Signed-off-by: Parav Pandit Signed-off-by: David Ahern --- Makefile | 2 +- man/man8/vdpa-dev.8 | 96 ++++++ man/man8/vdpa-mgmtdev.8 | 53 ++++ man/man8/vdpa.8 | 76 +++++ vdpa/Makefile | 25 ++ vdpa/vdpa.c | 675 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 926 insertions(+), 1 deletion(-) create mode 100644 man/man8/vdpa-dev.8 create mode 100644 man/man8/vdpa-mgmtdev.8 create mode 100644 man/man8/vdpa.8 create mode 100644 vdpa/Makefile create mode 100644 vdpa/vdpa.c diff --git a/Makefile b/Makefile index e64c6599..19bd163e 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ WFLAGS += -Wmissing-declarations -Wold-style-definition -Wformat=2 CFLAGS := $(WFLAGS) $(CCOPTS) -I../include -I../include/uapi $(DEFINES) $(CFLAGS) YACCFLAGS = -d -t -v -SUBDIRS=lib ip tc bridge misc netem genl tipc devlink rdma dcb man +SUBDIRS=lib ip tc bridge misc netem genl tipc devlink 
rdma dcb man vdpa LIBNETLINK=../lib/libutil.a ../lib/libnetlink.a LDLIBS += $(LIBNETLINK) diff --git a/man/man8/vdpa-dev.8 b/man/man8/vdpa-dev.8 new file mode 100644 index 00000000..36433519 --- /dev/null +++ b/man/man8/vdpa-dev.8 @@ -0,0 +1,96 @@ +.TH VDPA\-DEV 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa-dev \- vdpa device configuration +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.B dev +.RI "[ " OPTIONS " ] " +.RI " { " COMMAND | " " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] +} + +.ti -8 +.B vdpa dev show +.RI "[ " DEV " ]" + +.ti -8 +.B vdpa dev help + +.ti -8 +.B vdpa dev add +.B name +.I NAME +.B mgmtdev +.I MGMTDEV + +.ti -8 +.B vdpa dev del +.I DEV + +.SH "DESCRIPTION" +.SS vdpa dev show - display vdpa device attributes + +.PP +.I "DEV" +- specifies the vdpa device to show. +If this argument is omitted all devices are listed. + +.in +4 +Format is: +.in +2 +VDPA_DEVICE_NAME + +.SS vdpa dev add - add a new vdpa device. + +.TP +.BI name " NAME" +Name of the new vdpa device to add. + +.TP +.BI mgmtdev " MGMTDEV" +Name of the management device to use for device addition. + +.SS vdpa dev del - Delete the vdpa device. + +.PP +.I "DEV" +- specifies the vdpa device to delete. + +.SH "EXAMPLES" +.PP +vdpa dev show +.RS 4 +Shows all the vdpa devices on the system. +.RE +.PP +vdpa dev show foo +.RS 4 +Shows the specified vdpa device. +.RE +.PP +vdpa dev add name foo mgmtdev vdpa_sim_net +.RS 4 +Add the vdpa device named foo on the management device vdpa_sim_net. +.RE +.PP +vdpa dev del foo +.RS 4 +Delete the vdpa device named foo which was previously created. 
+.RE + +.SH SEE ALSO +.BR vdpa (8), +.BR vdpa-mgmtdev (8), +.br + +.SH AUTHOR +Parav Pandit diff --git a/man/man8/vdpa-mgmtdev.8 b/man/man8/vdpa-mgmtdev.8 new file mode 100644 index 00000000..cae2cbd0 --- /dev/null +++ b/man/man8/vdpa-mgmtdev.8 @@ -0,0 +1,53 @@ +.TH VDPA\-MGMTDEV 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa-mgmtdev \- vdpa management device view +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.B mgmtdev +.RI " { " COMMAND | " " +.BR help " }" +.sp + +.ti -8 +.IR OPTIONS " := { " +\fB\-V\fR[\fIersion\fR] +} + +.ti -8 +.B vdpa mgmtdev show +.RI "[ " MGMTDEV " ]" + +.ti -8 +.B vdpa mgmtdev help + +.SH "DESCRIPTION" +.SS vdpa mgmtdev show - display vdpa management device attributes + +.PP +.I "MGMTDEV" +- specifies the vdpa management device to show. +If this argument is omitted all management devices are listed. + +.SH "EXAMPLES" +.PP +vdpa mgmtdev show +.RS 4 +Shows all the vdpa management devices on the system. +.RE +.PP +vdpa mgmtdev show bar +.RS 4 +Shows the specified vdpa management device. +.RE + +.SH SEE ALSO +.BR vdpa (8), +.BR vdpa-dev (8), +.br + +.SH AUTHOR +Parav Pandit diff --git a/man/man8/vdpa.8 b/man/man8/vdpa.8 new file mode 100644 index 00000000..d1aaecec --- /dev/null +++ b/man/man8/vdpa.8 @@ -0,0 +1,76 @@ +.TH VDPA 8 "5 Jan 2021" "iproute2" "Linux" +.SH NAME +vdpa \- vdpa management tool +.SH SYNOPSIS +.sp +.ad l +.in +8 +.ti -8 +.B vdpa +.RI "[ " OPTIONS " ] { " dev | mgmtdev " } { " COMMAND " | " +.BR help " }" +.sp + +.SH OPTIONS + +.TP +.BR "\-V" , " --Version" +Print the version of the +.B vdpa +utility and exit. + +.TP +.BR "\-j" , " --json" +Generate JSON output. + +.TP +.BR "\-p" , " --pretty" +When combined with -j generate a pretty JSON output. + +.SS +.I OBJECT + +.TP +.B dev +- vdpa device. + +.TP +.B mgmtdev +- vdpa management device. + +.SS +.I COMMAND + +Specifies the action to perform on the object. +The set of possible actions depends on the object type. +It is possible to +.B show +(or +.B list +) objects. 
The +.B help +command is available for all objects. It prints +out a list of available commands and argument syntax conventions. +.sp +If no command is given, some default command is assumed. +Usually it is +.B show +or, if the objects of this class cannot be listed, +.BR "help" . + +.SH EXIT STATUS +Exit status is 0 if command was successful or a positive integer upon failure. + +.SH SEE ALSO +.BR vdpa-dev (8), +.BR vdpa-mgmtdev (8), +.br + +.SH REPORTING BUGS +Report any bugs to the Network Developers mailing list +.B +where the development and maintenance is primarily done. +You do not have to be subscribed to the list to send a message there. + +.SH AUTHOR +Parav Pandit diff --git a/vdpa/Makefile b/vdpa/Makefile new file mode 100644 index 00000000..253e20a7 --- /dev/null +++ b/vdpa/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../config.mk + +TARGETS := + +ifeq ($(HAVE_MNL),y) + +CFLAGS += -I./include/uapi/ +VDPAOBJ = vdpa.o +TARGETS += vdpa + +endif + +all: $(TARGETS) $(LIBS) + +vdpa: $(VDPAOBJ) + $(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@ + +install: all + for i in $(TARGETS); \ + do install -m 0755 $$i $(DESTDIR)$(SBINDIR); \ + done + +clean: + rm -f $(VDPAOBJ) $(TARGETS) diff --git a/vdpa/vdpa.c b/vdpa/vdpa.c new file mode 100644 index 00000000..7fdb36b9 --- /dev/null +++ b/vdpa/vdpa.c @@ -0,0 +1,675 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "mnl_utils.h" + +#include "version.h" +#include "json_print.h" +#include "utils.h" + +#define VDPA_OPT_MGMTDEV_HANDLE BIT(0) +#define VDPA_OPT_VDEV_MGMTDEV_HANDLE BIT(1) +#define VDPA_OPT_VDEV_NAME BIT(2) +#define VDPA_OPT_VDEV_HANDLE BIT(3) + +struct vdpa_opts { + uint64_t present; /* flags of present items */ + char *mdev_bus_name; + char *mdev_name; + const char *vdev_name; + unsigned int device_id; +}; + +struct vdpa { + struct mnlu_gen_socket nlg; + struct vdpa_opts opts; + bool json_output; + 
struct indent_mem *indent; +}; + +static void pr_out_section_start(struct vdpa *vdpa, const char *name) +{ + open_json_object(NULL); + open_json_object(name); +} + +static void pr_out_section_end(struct vdpa *vdpa) +{ + close_json_object(); + close_json_object(); +} + +static void pr_out_array_start(struct vdpa *vdpa, const char *name) +{ + if (!vdpa->json_output) { + print_nl(); + inc_indent(vdpa->indent); + print_indent(vdpa->indent); + } + open_json_array(PRINT_ANY, name); +} + +static void pr_out_array_end(struct vdpa *vdpa) +{ + close_json_array(PRINT_JSON, NULL); + if (!vdpa->json_output) + dec_indent(vdpa->indent); +} + +static const enum mnl_attr_data_type vdpa_policy[VDPA_ATTR_MAX + 1] = { + [VDPA_ATTR_MGMTDEV_BUS_NAME] = MNL_TYPE_NUL_STRING, + [VDPA_ATTR_MGMTDEV_DEV_NAME] = MNL_TYPE_NUL_STRING, + [VDPA_ATTR_DEV_NAME] = MNL_TYPE_STRING, + [VDPA_ATTR_DEV_ID] = MNL_TYPE_U32, + [VDPA_ATTR_DEV_VENDOR_ID] = MNL_TYPE_U32, + [VDPA_ATTR_DEV_MAX_VQS] = MNL_TYPE_U32, + [VDPA_ATTR_DEV_MAX_VQ_SIZE] = MNL_TYPE_U16, +}; + +static int attr_cb(const struct nlattr *attr, void *data) +{ + const struct nlattr **tb = data; + int type; + + if (mnl_attr_type_valid(attr, VDPA_ATTR_MAX) < 0) + return MNL_CB_OK; + + type = mnl_attr_get_type(attr); + if (mnl_attr_validate(attr, vdpa_policy[type]) < 0) + return MNL_CB_ERROR; + + tb[type] = attr; + return MNL_CB_OK; +} + +static int vdpa_argv_handle(struct vdpa *vdpa, int argc, char **argv, + char **p_mdev_bus_name, + char **p_mdev_name) +{ + unsigned int slashcount; + char *str; + + if (argc <= 0 || *argv == NULL) { + fprintf(stderr, + "vdpa identification (\"mgmtdev_bus_name/mgmtdev_name\") expected\n"); + return -EINVAL; + } + str = *argv; + slashcount = get_str_char_count(str, '/'); + if (slashcount > 1) { + fprintf(stderr, + "Wrong vdpa mgmtdev identification string format\n"); + fprintf(stderr, "Expected \"mgmtdev_bus_name/mgmtdev_name\"\n"); + fprintf(stderr, "Expected \"mgmtdev_name\"\n"); + return -EINVAL; + } + switch 
(slashcount) { + case 0: + *p_mdev_bus_name = NULL; + *p_mdev_name = str; + return 0; + case 1: + str_split_by_char(str, p_mdev_bus_name, p_mdev_name, '/'); + return 0; + default: + return -EINVAL; + } +} + +static int vdpa_argv_str(struct vdpa *vdpa, int argc, char **argv, + const char **p_str) +{ + if (argc <= 0 || *argv == NULL) { + fprintf(stderr, "String parameter expected\n"); + return -EINVAL; + } + *p_str = *argv; + return 0; +} + +struct vdpa_args_metadata { + uint64_t o_flag; + const char *err_msg; +}; + +static const struct vdpa_args_metadata vdpa_args_required[] = { + {VDPA_OPT_VDEV_MGMTDEV_HANDLE, "management device handle not set."}, + {VDPA_OPT_VDEV_NAME, "device name is not set."}, + {VDPA_OPT_VDEV_HANDLE, "device name is not set."}, +}; + +static int vdpa_args_finding_required_validate(uint64_t o_required, + uint64_t o_found) +{ + uint64_t o_flag; + int i; + + for (i = 0; i < ARRAY_SIZE(vdpa_args_required); i++) { + o_flag = vdpa_args_required[i].o_flag; + if ((o_required & o_flag) && !(o_found & o_flag)) { + fprintf(stderr, "%s\n", vdpa_args_required[i].err_msg); + return -EINVAL; + } + } + if (o_required & ~o_found) { + fprintf(stderr, + "BUG: unknown argument required but not found\n"); + return -EINVAL; + } + return 0; +} + +static void vdpa_opts_put(struct nlmsghdr *nlh, struct vdpa *vdpa) +{ + struct vdpa_opts *opts = &vdpa->opts; + + if ((opts->present & VDPA_OPT_MGMTDEV_HANDLE) || + (opts->present & VDPA_OPT_VDEV_MGMTDEV_HANDLE)) { + if (opts->mdev_bus_name) + mnl_attr_put_strz(nlh, VDPA_ATTR_MGMTDEV_BUS_NAME, + opts->mdev_bus_name); + mnl_attr_put_strz(nlh, VDPA_ATTR_MGMTDEV_DEV_NAME, + opts->mdev_name); + } + if ((opts->present & VDPA_OPT_VDEV_NAME) || + (opts->present & VDPA_OPT_VDEV_HANDLE)) + mnl_attr_put_strz(nlh, VDPA_ATTR_DEV_NAME, opts->vdev_name); +} + +static int vdpa_argv_parse(struct vdpa *vdpa, int argc, char **argv, + uint64_t o_required) +{ + struct vdpa_opts *opts = &vdpa->opts; + uint64_t o_all = o_required; + uint64_t 
o_found = 0; + int err; + + if (o_required & VDPA_OPT_MGMTDEV_HANDLE) { + err = vdpa_argv_handle(vdpa, argc, argv, &opts->mdev_bus_name, + &opts->mdev_name); + if (err) + return err; + + NEXT_ARG_FWD(); + o_found |= VDPA_OPT_MGMTDEV_HANDLE; + } else if (o_required & VDPA_OPT_VDEV_HANDLE) { + err = vdpa_argv_str(vdpa, argc, argv, &opts->vdev_name); + if (err) + return err; + + NEXT_ARG_FWD(); + o_found |= VDPA_OPT_VDEV_HANDLE; + } + + while (NEXT_ARG_OK()) { + if ((matches(*argv, "name") == 0) && + (o_all & VDPA_OPT_VDEV_NAME)) { + const char *namestr; + + NEXT_ARG_FWD(); + err = vdpa_argv_str(vdpa, argc, argv, &namestr); + if (err) + return err; + opts->vdev_name = namestr; + NEXT_ARG_FWD(); + o_found |= VDPA_OPT_VDEV_NAME; + } else if ((matches(*argv, "mgmtdev") == 0) && + (o_all & VDPA_OPT_VDEV_MGMTDEV_HANDLE)) { + NEXT_ARG_FWD(); + err = vdpa_argv_handle(vdpa, argc, argv, + &opts->mdev_bus_name, + &opts->mdev_name); + if (err) + return err; + + NEXT_ARG_FWD(); + o_found |= VDPA_OPT_VDEV_MGMTDEV_HANDLE; + } else { + fprintf(stderr, "Unknown option \"%s\"\n", *argv); + return -EINVAL; + } + } + + opts->present = o_found; + + return vdpa_args_finding_required_validate(o_required, o_found); +} + +static int vdpa_argv_parse_put(struct nlmsghdr *nlh, struct vdpa *vdpa, + int argc, char **argv, + uint64_t o_required) +{ + int err; + + err = vdpa_argv_parse(vdpa, argc, argv, o_required); + if (err) + return err; + vdpa_opts_put(nlh, vdpa); + return 0; +} + +static void cmd_mgmtdev_help(void) +{ + fprintf(stderr, "Usage: vdpa mgmtdev show [ DEV ]\n"); +} + +static void pr_out_handle_start(struct vdpa *vdpa, struct nlattr **tb) +{ + const char *mdev_bus_name = NULL; + const char *mdev_name; + SPRINT_BUF(buf); + + mdev_name = mnl_attr_get_str(tb[VDPA_ATTR_MGMTDEV_DEV_NAME]); + if (tb[VDPA_ATTR_MGMTDEV_BUS_NAME]) { + mdev_bus_name = mnl_attr_get_str(tb[VDPA_ATTR_MGMTDEV_BUS_NAME]); + sprintf(buf, "%s/%s", mdev_bus_name, mdev_name); + } else { + sprintf(buf, "%s", 
mdev_name); + } + + if (vdpa->json_output) + open_json_object(buf); + else + printf("%s: ", buf); +} + +static void pr_out_handle_end(struct vdpa *vdpa) +{ + if (vdpa->json_output) + close_json_object(); + else + print_nl(); +} + +static void __pr_out_vdev_handle_start(struct vdpa *vdpa, const char *vdev_name) +{ + SPRINT_BUF(buf); + + sprintf(buf, "%s", vdev_name); + if (vdpa->json_output) + open_json_object(buf); + else + printf("%s: ", buf); +} + +static void pr_out_vdev_handle_start(struct vdpa *vdpa, struct nlattr **tb) +{ + const char *vdev_name; + + vdev_name = mnl_attr_get_str(tb[VDPA_ATTR_DEV_NAME]); + __pr_out_vdev_handle_start(vdpa, vdev_name); +} + +static void pr_out_vdev_handle_end(struct vdpa *vdpa) +{ + if (vdpa->json_output) + close_json_object(); + else + print_nl(); +} + +static struct str_num_map class_map[] = { + { .str = "net", .num = VIRTIO_ID_NET }, + { .str = "block", .num = VIRTIO_ID_BLOCK }, + { .str = NULL, }, +}; + +static const char *parse_class(int num) +{ + const char *class; + + class = str_map_lookup_uint(class_map, num); + return class ? 
class : "< unknown class >"; +} + +static void pr_out_mgmtdev_show(struct vdpa *vdpa, const struct nlmsghdr *nlh, + struct nlattr **tb) +{ + const char *class; + unsigned int i; + + pr_out_handle_start(vdpa, tb); + + if (tb[VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES]) { + uint64_t classes = mnl_attr_get_u64(tb[VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES]); + + pr_out_array_start(vdpa, "supported_classes"); + + for (i = 1; i < 64; i++) { + if ((classes & (1ULL << i)) == 0) + continue; + + class = parse_class(i); + print_string(PRINT_ANY, NULL, " %s", class); + } + pr_out_array_end(vdpa); + } + + pr_out_handle_end(vdpa); +} + +static int cmd_mgmtdev_show_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[VDPA_ATTR_MAX + 1] = {}; + struct vdpa *vdpa = data; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + + if (!tb[VDPA_ATTR_MGMTDEV_DEV_NAME]) + return MNL_CB_ERROR; + + pr_out_mgmtdev_show(vdpa, nlh, tb); + + return MNL_CB_OK; +} + +static int cmd_mgmtdev_show(struct vdpa *vdpa, int argc, char **argv) +{ + uint16_t flags = NLM_F_REQUEST | NLM_F_ACK; + struct nlmsghdr *nlh; + int err; + + if (argc == 0) + flags |= NLM_F_DUMP; + + nlh = mnlu_gen_socket_cmd_prepare(&vdpa->nlg, VDPA_CMD_MGMTDEV_GET, + flags); + if (argc > 0) { + err = vdpa_argv_parse_put(nlh, vdpa, argc, argv, + VDPA_OPT_MGMTDEV_HANDLE); + if (err) + return err; + } + + pr_out_section_start(vdpa, "mgmtdev"); + err = mnlu_gen_socket_sndrcv(&vdpa->nlg, nlh, cmd_mgmtdev_show_cb, vdpa); + pr_out_section_end(vdpa); + return err; +} + +static int cmd_mgmtdev(struct vdpa *vdpa, int argc, char **argv) +{ + if (!argc || matches(*argv, "help") == 0) { + cmd_mgmtdev_help(); + return 0; + } else if (matches(*argv, "show") == 0 || + matches(*argv, "list") == 0) { + return cmd_mgmtdev_show(vdpa, argc - 1, argv + 1); + } + fprintf(stderr, "Command \"%s\" not found\n", *argv); + return -ENOENT; +} + +static void cmd_dev_help(void) +{ + fprintf(stderr, "Usage: 
vdpa dev show [ DEV ]\n"); + fprintf(stderr, " vdpa dev add name NAME mgmtdev MANAGEMENTDEV\n"); + fprintf(stderr, " vdpa dev del DEV\n"); +} + +static const char *device_type_name(uint32_t type) +{ + switch (type) { + case 0x1: return "network"; + case 0x2: return "block"; + default: return ""; + } +} + +static void pr_out_dev(struct vdpa *vdpa, struct nlattr **tb) +{ + const char *mdev_name = mnl_attr_get_str(tb[VDPA_ATTR_MGMTDEV_DEV_NAME]); + uint32_t device_id = mnl_attr_get_u32(tb[VDPA_ATTR_DEV_ID]); + const char *mdev_bus_name = NULL; + char mgmtdev_buf[128]; + + if (tb[VDPA_ATTR_MGMTDEV_BUS_NAME]) + mdev_bus_name = mnl_attr_get_str(tb[VDPA_ATTR_MGMTDEV_BUS_NAME]); + + if (mdev_bus_name) + sprintf(mgmtdev_buf, "%s/%s", mdev_bus_name, mdev_name); + else + sprintf(mgmtdev_buf, "%s", mdev_name); + pr_out_vdev_handle_start(vdpa, tb); + print_string(PRINT_ANY, "type", "type %s", device_type_name(device_id)); + print_string(PRINT_ANY, "mgmtdev", " mgmtdev %s", mgmtdev_buf); + + if (tb[VDPA_ATTR_DEV_VENDOR_ID]) + print_uint(PRINT_ANY, "vendor_id", " vendor_id %u", + mnl_attr_get_u32(tb[VDPA_ATTR_DEV_VENDOR_ID])); + if (tb[VDPA_ATTR_DEV_MAX_VQS]) + print_uint(PRINT_ANY, "max_vqs", " max_vqs %u", + mnl_attr_get_u32(tb[VDPA_ATTR_DEV_MAX_VQS])); + if (tb[VDPA_ATTR_DEV_MAX_VQ_SIZE]) + print_uint(PRINT_ANY, "max_vq_size", " max_vq_size %u", + mnl_attr_get_u16(tb[VDPA_ATTR_DEV_MAX_VQ_SIZE])); + pr_out_vdev_handle_end(vdpa); +} + +static int cmd_dev_show_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[VDPA_ATTR_MAX + 1] = {}; + struct vdpa *vdpa = data; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[VDPA_ATTR_MGMTDEV_DEV_NAME] || + !tb[VDPA_ATTR_DEV_NAME] || !tb[VDPA_ATTR_DEV_ID]) + return MNL_CB_ERROR; + pr_out_dev(vdpa, tb); + return MNL_CB_OK; +} + +static int cmd_dev_show(struct vdpa *vdpa, int argc, char **argv) +{ + uint16_t flags = NLM_F_REQUEST | NLM_F_ACK; + struct nlmsghdr 
*nlh; + int err; + + if (argc <= 0) + flags |= NLM_F_DUMP; + + nlh = mnlu_gen_socket_cmd_prepare(&vdpa->nlg, VDPA_CMD_DEV_GET, flags); + if (argc > 0) { + err = vdpa_argv_parse_put(nlh, vdpa, argc, argv, + VDPA_OPT_VDEV_HANDLE); + if (err) + return err; + } + + pr_out_section_start(vdpa, "dev"); + err = mnlu_gen_socket_sndrcv(&vdpa->nlg, nlh, cmd_dev_show_cb, vdpa); + pr_out_section_end(vdpa); + return err; +} + +static int cmd_dev_add(struct vdpa *vdpa, int argc, char **argv) +{ + struct nlmsghdr *nlh; + int err; + + nlh = mnlu_gen_socket_cmd_prepare(&vdpa->nlg, VDPA_CMD_DEV_NEW, + NLM_F_REQUEST | NLM_F_ACK); + err = vdpa_argv_parse_put(nlh, vdpa, argc, argv, + VDPA_OPT_VDEV_MGMTDEV_HANDLE | VDPA_OPT_VDEV_NAME); + if (err) + return err; + + return mnlu_gen_socket_sndrcv(&vdpa->nlg, nlh, NULL, NULL); +} + +static int cmd_dev_del(struct vdpa *vdpa, int argc, char **argv) +{ + struct nlmsghdr *nlh; + int err; + + nlh = mnlu_gen_socket_cmd_prepare(&vdpa->nlg, VDPA_CMD_DEV_DEL, + NLM_F_REQUEST | NLM_F_ACK); + err = vdpa_argv_parse_put(nlh, vdpa, argc, argv, VDPA_OPT_VDEV_HANDLE); + if (err) + return err; + + return mnlu_gen_socket_sndrcv(&vdpa->nlg, nlh, NULL, NULL); +} + +static int cmd_dev(struct vdpa *vdpa, int argc, char **argv) +{ + if (!argc) + return cmd_dev_show(vdpa, argc - 1, argv + 1); + + if (matches(*argv, "help") == 0) { + cmd_dev_help(); + return 0; + } else if (matches(*argv, "show") == 0 || + matches(*argv, "list") == 0) { + return cmd_dev_show(vdpa, argc - 1, argv + 1); + } else if (matches(*argv, "add") == 0) { + return cmd_dev_add(vdpa, argc - 1, argv + 1); + } else if (matches(*argv, "del") == 0) { + return cmd_dev_del(vdpa, argc - 1, argv + 1); + } + fprintf(stderr, "Command \"%s\" not found\n", *argv); + return -ENOENT; +} + +static void help(void) +{ + fprintf(stderr, + "Usage: vdpa [ OPTIONS ] OBJECT { COMMAND | help }\n" + "where OBJECT := { mgmtdev | dev }\n" + " OPTIONS := { -V[ersion] | -n[o-nice-names] | -j[son] | -p[retty] | -v[erbose] 
}\n"); +} + +static int vdpa_cmd(struct vdpa *vdpa, int argc, char **argv) +{ + if (!argc || matches(*argv, "help") == 0) { + help(); + return 0; + } else if (matches(*argv, "mgmtdev") == 0) { + return cmd_mgmtdev(vdpa, argc - 1, argv + 1); + } else if (matches(*argv, "dev") == 0) { + return cmd_dev(vdpa, argc - 1, argv + 1); + } + fprintf(stderr, "Object \"%s\" not found\n", *argv); + return -ENOENT; +} + +static int vdpa_init(struct vdpa *vdpa) +{ + int err; + + err = mnlu_gen_socket_open(&vdpa->nlg, VDPA_GENL_NAME, + VDPA_GENL_VERSION); + if (err) { + fprintf(stderr, "Failed to connect to vdpa Netlink\n"); + return -errno; + } + new_json_obj_plain(vdpa->json_output); + return 0; +} + +static void vdpa_fini(struct vdpa *vdpa) +{ + delete_json_obj_plain(); + mnlu_gen_socket_close(&vdpa->nlg); +} + +static struct vdpa *vdpa_alloc(void) +{ + struct vdpa *vdpa = calloc(1, sizeof(struct vdpa)); + + if (!vdpa) + return NULL; + + vdpa->indent = alloc_indent_mem(); + if (!vdpa->indent) + goto indent_err; + + return vdpa; + +indent_err: + free(vdpa); + return NULL; +} + +static void vdpa_free(struct vdpa *vdpa) +{ + free_indent_mem(vdpa->indent); + free(vdpa); +} + +int main(int argc, char **argv) +{ + static const struct option long_options[] = { + { "Version", no_argument, NULL, 'V' }, + { "json", no_argument, NULL, 'j' }, + { "pretty", no_argument, NULL, 'p' }, + { "help", no_argument, NULL, 'h' }, + { NULL, 0, NULL, 0 } + }; + struct vdpa *vdpa; + int opt; + int err; + int ret; + + vdpa = vdpa_alloc(); + if (!vdpa) { + fprintf(stderr, "Failed to allocate memory for vdpa\n"); + return EXIT_FAILURE; + } + + while ((opt = getopt_long(argc, argv, "Vjpsh", long_options, NULL)) >= 0) { + switch (opt) { + case 'V': + printf("vdpa utility, iproute2-%s\n", version); + ret = EXIT_SUCCESS; + goto vdpa_free; + case 'j': + vdpa->json_output = true; + break; + case 'p': + pretty = true; + break; + case 'h': + help(); + ret = EXIT_SUCCESS; + goto vdpa_free; + default: + 
fprintf(stderr, "Unknown option.\n"); + help(); + ret = EXIT_FAILURE; + goto vdpa_free; + } + } + + argc -= optind; + argv += optind; + + err = vdpa_init(vdpa); + if (err) { + ret = EXIT_FAILURE; + goto vdpa_free; + } + + err = vdpa_cmd(vdpa, argc, argv); + if (err) { + ret = EXIT_FAILURE; + goto vdpa_fini; + } + + ret = EXIT_SUCCESS; + +vdpa_fini: + vdpa_fini(vdpa); +vdpa_free: + vdpa_free(vdpa); + return ret; +} From c946f5d3e414a9edef6b673af5079d94d427c444 Mon Sep 17 00:00:00 2001 From: Oleksandr Mazur Date: Tue, 9 Feb 2021 12:31:51 +0200 Subject: [PATCH 29/32] devlink: add support for port params get/set Add implementation for the port parameters getting/setting. Add bash completion for port param. Add man description for port param. Signed-off-by: Oleksandr Mazur Signed-off-by: David Ahern --- bash-completion/devlink | 55 ++++++++ devlink/devlink.c | 274 +++++++++++++++++++++++++++++++++++++++- man/man8/devlink-port.8 | 65 ++++++++++ 3 files changed, 388 insertions(+), 6 deletions(-) diff --git a/bash-completion/devlink b/bash-completion/devlink index 7395b504..361be9fe 100644 --- a/bash-completion/devlink +++ b/bash-completion/devlink @@ -319,6 +319,57 @@ _devlink_port_split() esac } +# Completion for devlink port param set +_devlink_port_param_set() +{ + case $cword in + 7) + COMPREPLY=( $( compgen -W "value" -- "$cur" ) ) + return + ;; + 8) + # String argument + return + ;; + 9) + COMPREPLY=( $( compgen -W "cmode" -- "$cur" ) ) + return + ;; + 10) + COMPREPLY=( $( compgen -W "runtime driverinit permanent" -- \ + "$cur" ) ) + return + ;; + esac +} + +# Completion for devlink port param +_devlink_port_param() +{ + case "$cword" in + 3) + COMPREPLY=( $( compgen -W "show set" -- "$cur" ) ) + return + ;; + 4) + _devlink_direct_complete "port" + return + ;; + 5) + COMPREPLY=( $( compgen -W "name" -- "$cur" ) ) + return + ;; + 6) + _devlink_direct_complete "param_name" + return + ;; + esac + + if [[ "${words[3]}" == "set" ]]; then + _devlink_port_param_set + fi +} 
+ # Completion for devlink port _devlink_port() { @@ -331,6 +382,10 @@ _devlink_port() _devlink_port_split return ;; + param) + _devlink_port_param + return + ;; show|unsplit) if [[ $cword -eq 3 ]]; then _devlink_direct_complete "port" diff --git a/devlink/devlink.c b/devlink/devlink.c index 10398f77..c6e85ff9 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -2808,7 +2808,8 @@ static void pr_out_param_value(struct dl *dl, const char *nla_name, } } -static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) +static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array, + bool is_port_param) { struct nlattr *nla_param[DEVLINK_ATTR_MAX + 1] = {}; struct nlattr *param_value_attr; @@ -2825,9 +2826,15 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) return; if (array) - pr_out_handle_start_arr(dl, tb); + if (is_port_param) + pr_out_port_handle_start_arr(dl, tb, false); + else + pr_out_handle_start_arr(dl, tb); else - __pr_out_handle_start(dl, tb, true, false); + if (is_port_param) + pr_out_port_handle_start(dl, tb, false); + else + __pr_out_handle_start(dl, tb, true, false); nla_type = mnl_attr_get_u8(nla_param[DEVLINK_ATTR_PARAM_TYPE]); @@ -2847,7 +2854,10 @@ static void pr_out_param(struct dl *dl, struct nlattr **tb, bool array) pr_out_entry_end(dl); } pr_out_array_end(dl); - pr_out_handle_end(dl); + if (is_port_param) + pr_out_port_handle_end(dl); + else + pr_out_handle_end(dl); } static int cmd_dev_param_show_cb(const struct nlmsghdr *nlh, void *data) @@ -2860,7 +2870,7 @@ static int cmd_dev_param_show_cb(const struct nlmsghdr *nlh, void *data) if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] || !tb[DEVLINK_ATTR_PARAM]) return MNL_CB_ERROR; - pr_out_param(dl, tb, true); + pr_out_param(dl, tb, true, false); return MNL_CB_OK; } @@ -3058,6 +3068,21 @@ err_param_value_parse: return err; } +static int cmd_port_param_show_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = 
mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + struct dl *dl = data; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] || + !tb[DEVLINK_ATTR_PORT_INDEX] || !tb[DEVLINK_ATTR_PARAM]) + return MNL_CB_ERROR; + + pr_out_param(dl, tb, true, true); + return MNL_CB_OK; +} + static int cmd_dev_param_show(struct dl *dl) { uint16_t flags = NLM_F_REQUEST | NLM_F_ACK; @@ -3803,6 +3828,8 @@ static void cmd_port_help(void) pr_err(" devlink port split DEV/PORT_INDEX count COUNT\n"); pr_err(" devlink port unsplit DEV/PORT_INDEX\n"); pr_err(" devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ] [ state STATE ]\n"); + pr_err(" devlink port param set DEV/PORT_INDEX name PARAMETER value VALUE cmode { permanent | driverinit | runtime }\n"); + pr_err(" devlink port param show [DEV/PORT_INDEX name PARAMETER]\n"); pr_err(" devlink port health show [ DEV/PORT_INDEX reporter REPORTER_NAME ]\n"); pr_err(" devlink port add DEV/PORT_INDEX flavour FLAVOUR pfnum PFNUM [ sfnum SFNUM ]\n"); pr_err(" devlink port del DEV/PORT_INDEX\n"); @@ -4065,6 +4092,31 @@ static int cmd_port_unsplit(struct dl *dl) return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); } +static int cmd_port_param_show(struct dl *dl) +{ + uint16_t flags = NLM_F_REQUEST | NLM_F_ACK; + struct nlmsghdr *nlh; + int err; + + if (dl_argc(dl) == 0) + flags |= NLM_F_DUMP; + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_PARAM_GET, flags); + + if (dl_argc(dl) > 0) { + err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLEP | + DL_OPT_PARAM_NAME, 0); + if (err) + return err; + } + + pr_out_section_start(dl, "param"); + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_port_param_show_cb, dl); + pr_out_section_end(dl); + + return err; +} + static void cmd_port_function_help(void) { pr_err("Usage: devlink port function set DEV/PORT_INDEX [ hw_addr ADDR ] [ state STATE ]\n"); @@ -4089,6 +4141,205 @@ static int cmd_port_function_set(struct dl *dl) return 
_mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); } +static int cmd_port_param_set_cb(const struct nlmsghdr *nlh, void *data) +{ + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *nla_param[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *param_value_attr; + enum devlink_param_cmode cmode; + struct param_ctx *ctx = data; + struct dl *dl = ctx->dl; + int nla_type; + int err; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME] || + !tb[DEVLINK_ATTR_PORT_INDEX] || !tb[DEVLINK_ATTR_PARAM]) + return MNL_CB_ERROR; + + err = mnl_attr_parse_nested(tb[DEVLINK_ATTR_PARAM], attr_cb, nla_param); + if (err != MNL_CB_OK) + return MNL_CB_ERROR; + + if (!nla_param[DEVLINK_ATTR_PARAM_TYPE] || + !nla_param[DEVLINK_ATTR_PARAM_VALUES_LIST]) + return MNL_CB_ERROR; + + nla_type = mnl_attr_get_u8(nla_param[DEVLINK_ATTR_PARAM_TYPE]); + mnl_attr_for_each_nested(param_value_attr, + nla_param[DEVLINK_ATTR_PARAM_VALUES_LIST]) { + struct nlattr *nla_value[DEVLINK_ATTR_MAX + 1] = {}; + struct nlattr *val_attr; + + err = mnl_attr_parse_nested(param_value_attr, + attr_cb, nla_value); + if (err != MNL_CB_OK) + return MNL_CB_ERROR; + + if (!nla_value[DEVLINK_ATTR_PARAM_VALUE_CMODE] || + (nla_type != MNL_TYPE_FLAG && + !nla_value[DEVLINK_ATTR_PARAM_VALUE_DATA])) + return MNL_CB_ERROR; + + cmode = mnl_attr_get_u8(nla_value[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if (cmode == dl->opts.cmode) { + val_attr = nla_value[DEVLINK_ATTR_PARAM_VALUE_DATA]; + switch (nla_type) { + case MNL_TYPE_U8: + ctx->value.vu8 = mnl_attr_get_u8(val_attr); + break; + case MNL_TYPE_U16: + ctx->value.vu16 = mnl_attr_get_u16(val_attr); + break; + case MNL_TYPE_U32: + ctx->value.vu32 = mnl_attr_get_u32(val_attr); + break; + case MNL_TYPE_STRING: + ctx->value.vstr = mnl_attr_get_str(val_attr); + break; + case MNL_TYPE_FLAG: + ctx->value.vbool = val_attr ? 
true : false; + break; + } + break; + } + } + ctx->nla_type = nla_type; + return MNL_CB_OK; +} + +static int cmd_port_param_set(struct dl *dl) +{ + struct param_ctx ctx = {}; + struct nlmsghdr *nlh; + bool conv_exists; + uint32_t val_u32 = 0; + uint16_t val_u16; + uint8_t val_u8; + bool val_bool; + int err; + + err = dl_argv_parse(dl, DL_OPT_HANDLEP | + DL_OPT_PARAM_NAME | + DL_OPT_PARAM_VALUE | + DL_OPT_PARAM_CMODE, 0); + if (err) + return err; + + /* Get value type */ + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_PARAM_GET, + NLM_F_REQUEST | NLM_F_ACK); + dl_opts_put(nlh, dl); + + ctx.dl = dl; + err = _mnlg_socket_sndrcv(dl->nlg, nlh, cmd_port_param_set_cb, &ctx); + if (err) + return err; + + nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_PORT_PARAM_SET, + NLM_F_REQUEST | NLM_F_ACK); + dl_opts_put(nlh, dl); + + conv_exists = param_val_conv_exists(param_val_conv, PARAM_VAL_CONV_LEN, + dl->opts.param_name); + + mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_TYPE, ctx.nla_type); + switch (ctx.nla_type) { + case MNL_TYPE_U8: + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u8 = val_u32; + } else { + err = strtouint8_t(dl->opts.param_value, &val_u8); + } + if (err) + goto err_param_value_parse; + if (val_u8 == ctx.value.vu8) + return 0; + mnl_attr_put_u8(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u8); + break; + case MNL_TYPE_U16: + if (conv_exists) { + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + dl->opts.param_value, + &val_u32); + val_u16 = val_u32; + } else { + err = strtouint16_t(dl->opts.param_value, &val_u16); + } + if (err) + goto err_param_value_parse; + if (val_u16 == ctx.value.vu16) + return 0; + mnl_attr_put_u16(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u16); + break; + case MNL_TYPE_U32: + if (conv_exists) + err = param_val_conv_uint_get(param_val_conv, + PARAM_VAL_CONV_LEN, + dl->opts.param_name, + 
dl->opts.param_value, + &val_u32); + else + err = strtouint32_t(dl->opts.param_value, &val_u32); + if (err) + goto err_param_value_parse; + if (val_u32 == ctx.value.vu32) + return 0; + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, val_u32); + break; + case MNL_TYPE_FLAG: + err = strtobool(dl->opts.param_value, &val_bool); + if (err) + goto err_param_value_parse; + if (val_bool == ctx.value.vbool) + return 0; + if (val_bool) + mnl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, + 0, NULL); + break; + case MNL_TYPE_STRING: + mnl_attr_put_strz(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, + dl->opts.param_value); + if (!strcmp(dl->opts.param_value, ctx.value.vstr)) + return 0; + break; + default: + printf("Value type not supported\n"); + return -ENOTSUP; + } + return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL); + +err_param_value_parse: + pr_err("Value \"%s\" is not a number or not within range\n", + dl->opts.param_value); + return err; +} + +static int cmd_port_param(struct dl *dl) +{ + if (dl_argv_match(dl, "help")) { + cmd_port_help(); + return 0; + } else if (dl_argv_match(dl, "show") || + dl_argv_match(dl, "list") || dl_no_arg(dl)) { + dl_arg_inc(dl); + return cmd_port_param_show(dl); + } else if (dl_argv_match(dl, "set")) { + dl_arg_inc(dl); + return cmd_port_param_set(dl); + } + pr_err("Command \"%s\" not found\n", dl_argv(dl)); + return -ENOENT; +} + static int cmd_port_function(struct dl *dl) { if (dl_argv_match(dl, "help") || dl_no_arg(dl)) { @@ -4175,6 +4426,9 @@ static int cmd_port(struct dl *dl) } else if (dl_argv_match(dl, "unsplit")) { dl_arg_inc(dl); return cmd_port_unsplit(dl); + } else if (dl_argv_match(dl, "param")) { + dl_arg_inc(dl); + return cmd_port_param(dl); } else if (dl_argv_match(dl, "function")) { dl_arg_inc(dl); return cmd_port_function(dl); @@ -4996,6 +5250,10 @@ static const char *cmd_name(uint8_t cmd) case DEVLINK_CMD_REGION_SET: return "set"; case DEVLINK_CMD_REGION_NEW: return "new"; case DEVLINK_CMD_REGION_DEL: return "del"; + case 
DEVLINK_CMD_PORT_PARAM_GET: return "get"; + case DEVLINK_CMD_PORT_PARAM_SET: return "set"; + case DEVLINK_CMD_PORT_PARAM_NEW: return "new"; + case DEVLINK_CMD_PORT_PARAM_DEL: return "del"; case DEVLINK_CMD_FLASH_UPDATE: return "begin"; case DEVLINK_CMD_FLASH_UPDATE_END: return "end"; case DEVLINK_CMD_FLASH_UPDATE_STATUS: return "status"; @@ -5034,6 +5292,10 @@ static const char *cmd_obj(uint8_t cmd) case DEVLINK_CMD_PARAM_SET: case DEVLINK_CMD_PARAM_NEW: case DEVLINK_CMD_PARAM_DEL: + case DEVLINK_CMD_PORT_PARAM_GET: + case DEVLINK_CMD_PORT_PARAM_SET: + case DEVLINK_CMD_PORT_PARAM_NEW: + case DEVLINK_CMD_PORT_PARAM_DEL: return "param"; case DEVLINK_CMD_REGION_GET: case DEVLINK_CMD_REGION_SET: @@ -5176,7 +5438,7 @@ static int cmd_mon_show_cb(const struct nlmsghdr *nlh, void *data) !tb[DEVLINK_ATTR_PARAM]) return MNL_CB_ERROR; pr_out_mon_header(genl->cmd); - pr_out_param(dl, tb, false); + pr_out_param(dl, tb, false, false); pr_out_mon_footer(); break; case DEVLINK_CMD_REGION_GET: /* fall through */ diff --git a/man/man8/devlink-port.8 b/man/man8/devlink-port.8 index 55f1cce6..563c5833 100644 --- a/man/man8/devlink-port.8 +++ b/man/man8/devlink-port.8 @@ -70,6 +70,23 @@ devlink-port \- devlink port configuration .BR "state" .RI "STATE }" +.ti -8 +.B devlink port param set +.I DEV/PORT_INDEX +.B name +.I PARAMETER +.B value +.I VALUE +.BR cmode " { " runtime " | " driverinit " | " permanent " } " + +.ti -8 +.B devlink port param show +[ +.I DEV/PORT_INDEX +.B name +.I PARAMETER +] + .ti -8 .B devlink port help @@ -185,6 +202,44 @@ port type is Ethernet. .B "DEV/PORT_INDEX" - specifies the devlink port to delete. +.ti -8 +.SS devlink port param set - set new value to devlink port configuration parameter +.PP +.B "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.TP +.BI name " PARAMETER" +Specify parameter name to set. + +.TP +.BI value " VALUE" +New value to set. 
+ +.TP +.BR cmode " { " runtime " | " driverinit " | " permanent " } " +Configuration mode in which the new value is set. + +.I runtime +- Set new value while driver is running. This configuration mode doesn't require any reset to apply the new value. + +.I driverinit +- Set new value which will be applied during driver initialization. This configuration mode requires restart driver by devlink reload command to apply the new value. + +.I permanent +- New value is written to device's non-volatile memory. This configuration mode requires hard reset to apply the new value. + +.SS devlink port param show - display devlink port supported configuration parameters attributes + +.PP +.B "DEV/PORT_INDEX" +- specifies the devlink port to operate on. + +.B name +.I PARAMETER +Specify parameter name to show. +If this argument, as well as port index, are omitted - all parameters supported by devlink device ports are listed. + .SH "EXAMPLES" .PP devlink port show @@ -262,6 +317,16 @@ Configure hardware address and also active the function. When a function is activated together with other configuration in a single command, all the configuration is applied first before changing the state to active. .RE +.PP +devlink port param show +.RS 4 +Shows (dumps) all the port parameters across all the devices registered in the devlink. +.RE +.PP +devlink port param set pci/0000:01:00.0/1 name internal_error_reset value true cmode runtime +.RS 4 +Sets the parameter internal_error_reset of specified devlink port (#1) to true. 
+.RE .SH SEE ALSO .BR devlink (8), From 34de4b26bfcfb0d95bbf404e02d46efe3e2e8361 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 13 Feb 2021 17:48:05 -0700 Subject: [PATCH 30/32] Update kernel headers Update kernel headers to commit: c4762993129f ("Merge branch 'skbuff-introduce-skbuff_heads-bulking-and-reusing'") Signed-off-by: David Ahern --- include/uapi/linux/if_link.h | 1 - include/uapi/linux/if_tunnel.h | 1 - include/uapi/linux/mptcp.h | 74 ++++++++++++++++++++++++++++++++++ include/uapi/linux/rtnetlink.h | 5 +++ include/uapi/linux/tcp.h | 2 +- 5 files changed, 80 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c96880c5..50193377 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -809,7 +809,6 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, - IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index cb32781c..c7f0a5e6 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,7 +176,6 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index d31c9c6c..58ce1c6a 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -36,6 +36,7 @@ enum { /* netlink interface */ #define MPTCP_PM_NAME "mptcp_pm" #define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_EV_GRP_NAME "mptcp_pm_events" #define MPTCP_PM_VER 0x1 /* @@ -104,4 +105,77 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; }; +/* + * MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A new MPTCP 
connection has been created. It is the good time to allocate + * memory and send ADD_ADDR if needed. Depending on the traffic-patterns + * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + * + * MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A MPTCP connection is established (can start new subflows). + * + * MPTCP_EVENT_CLOSED: token + * A MPTCP connection has stopped. + * + * MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] + * A new address has been announced by the peer. + * + * MPTCP_EVENT_REMOVED: token, rem_id + * An address has been lost by the peer. + * + * MPTCP_EVENT_SUB_ESTABLISHED: token, family, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, + * if_idx [, error] + * A new subflow has been established. 'error' should not be set. + * + * MPTCP_EVENT_SUB_CLOSED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * A subflow has been closed. An error (copy of sk_err) could be set if an + * error has been detected for this subflow. + * + * MPTCP_EVENT_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * The priority of a subflow has changed. 'error' should not be set. 
+ */ +enum mptcp_event_type { + MPTCP_EVENT_UNSPEC = 0, + MPTCP_EVENT_CREATED = 1, + MPTCP_EVENT_ESTABLISHED = 2, + MPTCP_EVENT_CLOSED = 3, + + MPTCP_EVENT_ANNOUNCED = 6, + MPTCP_EVENT_REMOVED = 7, + + MPTCP_EVENT_SUB_ESTABLISHED = 10, + MPTCP_EVENT_SUB_CLOSED = 11, + + MPTCP_EVENT_SUB_PRIORITY = 13, +}; + +enum mptcp_event_attr { + MPTCP_ATTR_UNSPEC = 0, + + MPTCP_ATTR_TOKEN, /* u32 */ + MPTCP_ATTR_FAMILY, /* u16 */ + MPTCP_ATTR_LOC_ID, /* u8 */ + MPTCP_ATTR_REM_ID, /* u8 */ + MPTCP_ATTR_SADDR4, /* be32 */ + MPTCP_ATTR_SADDR6, /* struct in6_addr */ + MPTCP_ATTR_DADDR4, /* be32 */ + MPTCP_ATTR_DADDR6, /* struct in6_addr */ + MPTCP_ATTR_SPORT, /* be16 */ + MPTCP_ATTR_DPORT, /* be16 */ + MPTCP_ATTR_BACKUP, /* u8 */ + MPTCP_ATTR_ERROR, /* u8 */ + MPTCP_ATTR_FLAGS, /* u16 */ + MPTCP_ATTR_TIMEOUT, /* u32 */ + MPTCP_ATTR_IF_IDX, /* s32 */ + + __MPTCP_ATTR_AFTER_LAST +}; + +#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) + #endif /* _MPTCP_H */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index c66fd247..b34b9add 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -319,6 +319,11 @@ enum rt_scope_t { #define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ #define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ #define RTM_F_TRAP 0x8000 /* route is trapping packets */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed, this value + * is chosen to avoid conflicts with + * other flags defined in + * include/uapi/linux/ipv6_route.h + */ /* Reserved table identifiers */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 0614c608..a2066278 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -357,6 +357,6 @@ struct tcp_zerocopy_receive { __u64 msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; - /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. 
*/ + __u32 reserved; /* set to 0 for now */ }; #endif /* _LINUX_TCP_H */ From 33e2471e8f47a532123ee2642e97b6a425ac84ac Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Tue, 9 Feb 2021 11:12:00 +0200 Subject: [PATCH 31/32] ip route: Print "rt_offload_failed" indication The kernel signals when offload fails using the 'RTM_F_OFFLOAD_FAILED' flag. Print it to help users understand the offload state of the route. The "rt_" prefix is used in order to distinguish it from the offload state of nexthops, similar to "rt_offload" and "rt_trap". Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: David Ahern --- ip/iproute.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ip/iproute.c b/ip/iproute.c index ebb5f160..a8c4886b 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -374,6 +374,8 @@ void print_rt_flags(FILE *fp, unsigned int flags) print_string(PRINT_ANY, NULL, "%s ", "rt_offload"); if (flags & RTM_F_TRAP) print_string(PRINT_ANY, NULL, "%s ", "rt_trap"); + if (flags & RTM_F_OFFLOAD_FAILED) + print_string(PRINT_ANY, NULL, "%s ", "rt_offload_failed"); close_json_array(PRINT_JSON, NULL); } From c7897ec2a68b444b8aecc7aaeed8b80a5eefa7ea Mon Sep 17 00:00:00 2001 From: Thayne McCombs Date: Sun, 14 Feb 2021 01:09:13 -0700 Subject: [PATCH 32/32] ss: Make leading ":" always optional for sport and dport The sport and dport conditions in expressions were inconsistent on whether there should be a ":" at the beginning of the port when only a port was provided depending on the family. The link and netlink families required a ":" to work. The vsock family required the ":" to be absent. The inet and inet6 families work with or without a leading ":". This makes the leading ":" optional in all cases, so if sport or dport are used, then it works with a leading ":" or without one, as inet and inet6 did. 
Signed-off-by: Thayne McCombs Signed-off-by: David Ahern --- misc/ss.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/misc/ss.c b/misc/ss.c index aefa1c2f..5c934fa0 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -2111,6 +2111,18 @@ static void vsock_set_inet_prefix(inet_prefix *a, __u32 cid) memcpy(a->data, &cid, sizeof(cid)); } +static char* find_port(char *addr, bool is_port) +{ + char *port = NULL; + if (is_port) + port = addr; + else + port = strchr(addr, ':'); + if (port && *port == ':') + *port++ = '\0'; + return port; +} + void *parse_hostcond(char *addr, bool is_port) { char *port = NULL; @@ -2152,17 +2164,16 @@ void *parse_hostcond(char *addr, bool is_port) if (fam == AF_PACKET) { a.addr.family = AF_PACKET; a.addr.bitlen = 0; - port = strchr(addr, ':'); + port = find_port(addr, is_port); if (port) { - *port = 0; - if (port[1] && strcmp(port+1, "*")) { - if (get_integer(&a.port, port+1, 0)) { - if ((a.port = xll_name_to_index(port+1)) <= 0) + if (*port && strcmp(port, "*")) { + if (get_integer(&a.port, port, 0)) { + if ((a.port = xll_name_to_index(port)) <= 0) return NULL; } } } - if (addr[0] && strcmp(addr, "*")) { + if (!is_port && addr[0] && strcmp(addr, "*")) { unsigned short tmp; a.addr.bitlen = 32; @@ -2176,19 +2187,18 @@ void *parse_hostcond(char *addr, bool is_port) if (fam == AF_NETLINK) { a.addr.family = AF_NETLINK; a.addr.bitlen = 0; - port = strchr(addr, ':'); + port = find_port(addr, is_port); if (port) { - *port = 0; - if (port[1] && strcmp(port+1, "*")) { - if (get_integer(&a.port, port+1, 0)) { - if (strcmp(port+1, "kernel") == 0) + if (*port && strcmp(port, "*")) { + if (get_integer(&a.port, port, 0)) { + if (strcmp(port, "kernel") == 0) a.port = 0; else return NULL; } } } - if (addr[0] && strcmp(addr, "*")) { + if (!is_port && addr[0] && strcmp(addr, "*")) { a.addr.bitlen = 32; if (nl_proto_a2n(&a.addr.data[0], addr) == -1) return NULL; @@ -2201,21 +2211,13 @@ void 
*parse_hostcond(char *addr, bool is_port) a.addr.family = AF_VSOCK; - if (is_port) - port = addr; - else { - port = strchr(addr, ':'); - if (port) { - *port = '\0'; - port++; - } - } + port = find_port(addr, is_port); if (port && strcmp(port, "*") && get_u32((__u32 *)&a.port, port, 0)) return NULL; - if (addr[0] && strcmp(addr, "*")) { + if (!is_port && addr[0] && strcmp(addr, "*")) { a.addr.bitlen = 32; if (get_u32(&cid, addr, 0)) return NULL;