diff --git a/dcb/Makefile b/dcb/Makefile index 02d5d044..3a2e5d4c 100644 --- a/dcb/Makefile +++ b/dcb/Makefile @@ -13,6 +13,7 @@ DCBOBJ = dcb.o \ dcb_maxrate.o \ dcb_pfc.o TARGETS += dcb +LDLIBS += -lm endif diff --git a/include/json_print.h b/include/json_print.h index 1a1ad5ff..6fcf9fd9 100644 --- a/include/json_print.h +++ b/include/json_print.h @@ -15,6 +15,9 @@ #include "json_writer.h" #include "color.h" +#define _IS_JSON_CONTEXT(type) (is_json_context() && (type & PRINT_JSON || type & PRINT_ANY)) +#define _IS_FP_CONTEXT(type) (!is_json_context() && (type & PRINT_FP || type & PRINT_ANY)) + json_writer_t *get_json_writer(void); /* diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1daeda13..9c135426 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1656,22 +1656,30 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return @@ -2231,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -3836,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4001,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4501,6 +4576,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { @@ -4981,9 +5057,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ @@ -5029,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 58ce1c6a..c3e40165 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -103,6 +103,8 @@ struct mptcp_info { __u64 mptcpi_write_seq; __u64 mptcpi_snd_una; __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index afe6836e..7ea59cfe 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -593,6 +593,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ + __TCA_FLOWER_KEY_CT_FLAGS_MAX, }; enum { diff --git a/include/version.h b/include/version.h index cf7afbbf..1a1f4f83 100644 --- a/include/version.h +++ b/include/version.h @@ -1 +1 @@ -static const char version[] = "5.10.0"; +static const char version[] = "5.11.0"; diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 571346b1..0bbcee2b 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -922,6 +922,7 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) const char *name; unsigned int m_flag = 0; SPRINT_BUF(b1); + bool truncated_vfs = false; if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK) return 0; @@ -1199,15 +1200,18 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) if ((do_link || show_details) && tb[IFLA_VFINFO_LIST] && tb[IFLA_NUM_VF]) { struct rtattr *i, *vflist = tb[IFLA_VFINFO_LIST]; - int rem = RTA_PAYLOAD(vflist); + int rem = RTA_PAYLOAD(vflist), count = 0; open_json_array(PRINT_JSON, "vfinfo_list"); for (i = RTA_DATA(vflist); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) { open_json_object(NULL); print_vfinfo(fp, ifi, i); close_json_object(); + count++; } close_json_array(PRINT_JSON, NULL); + if (count != rta_getattr_u32(tb[IFLA_NUM_VF])) + truncated_vfs = true; } if (tb[IFLA_PROP_LIST]) { @@ -1228,6 +1232,9 @@ int print_linkinfo(struct nlmsghdr *n, void *arg) print_string(PRINT_FP, NULL, "%s", "\n"); fflush(fp); + /* prettier here if stderr and stdout go to the same place */ + if (truncated_vfs) + fprintf(stderr, "Truncated VF list: %s\n", name); return 1; } diff --git a/ip/iplink_bareudp.c b/ip/iplink_bareudp.c index 860ec699..aa311106 100644 --- a/ip/iplink_bareudp.c +++ b/ip/iplink_bareudp.c @@ -22,9 +22,11 @@ static void print_explain(FILE *f) " [ srcportmin PORT ]\n" " [ [no]multiproto ]\n" "\n" - "Where: PORT := 0-65535\n" - " PROTO := NUMBER | ip | mpls\n" - " SRCPORTMIN := 0-65535\n" + "Where: PORT := UDP_PORT\n" + " PROTO := ETHERTYPE\n" + "\n" + "Note: ETHERTYPE can be given as number or as protocol name (\"ipv4\", \"ipv6\",\n" + " \"mpls_uc\", etc.).\n" ); } diff --git a/ip/iproute.c b/ip/iproute.c index a8c4886b..291f1a58 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -2071,7 +2071,18 @@ static int iproute_get(int argc, char **argv) if (addr.bytelen) addattr_l(&req.n, sizeof(req), RTA_DST, &addr.data, addr.bytelen); - req.r.rtm_dst_len = addr.bitlen; + if (req.r.rtm_family == AF_INET && addr.bitlen != 32) { + fprintf(stderr, + "Warning: /%u as prefix is invalid, only /32 (or none) is supported.\n", + addr.bitlen); + req.r.rtm_dst_len = 32; + } else if (req.r.rtm_family == AF_INET6 && addr.bitlen != 128) { + fprintf(stderr, + "Warning: /%u as prefix is invalid, only /128 (or none) is supported.\n", + addr.bitlen); + req.r.rtm_dst_len = 128; + } else + req.r.rtm_dst_len = addr.bitlen; address_found = true; } argc--; argv++; diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c index 1ab95cd2..566fc7ea 100644 --- a/ip/iproute_lwtunnel.c +++ b/ip/iproute_lwtunnel.c @@ -891,13 +891,15 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp, NEXT_ARG(); if (table_ok++) duparg2("table", *argv); - rtnl_rttable_a2n(&table, *argv); + if (rtnl_rttable_a2n(&table, *argv)) + invarg("invalid table id\n", *argv); ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table); } else if (strcmp(*argv, "vrftable") == 0) { NEXT_ARG(); if (vrftable_ok++) duparg2("vrftable", *argv); - rtnl_rttable_a2n(&vrftable, *argv); + if (rtnl_rttable_a2n(&vrftable, *argv)) + invarg("invalid vrf table id\n", *argv); ret = rta_addattr32(rta, len, SEG6_LOCAL_VRFTABLE, vrftable); } else if (strcmp(*argv, "nh4") == 0) { diff --git a/ip/ipvrf.c b/ip/ipvrf.c index 42779e5c..91578031 100644 --- a/ip/ipvrf.c +++ b/ip/ipvrf.c @@ -278,8 +278,8 @@ static int vrf_configure_cgroup(const char *path, int ifindex) */ prog_fd = prog_load(ifindex); if (prog_fd < 0) { - fprintf(stderr, "Failed to load BPF prog: '%s'\n", - strerror(errno)); + fprintf(stderr, "Failed to load BPF prog: '%s'\n%s", + strerror(errno), bpf_log_buf); if (errno != EPERM) { fprintf(stderr, diff --git a/lib/Makefile b/lib/Makefile index 764c9137..6c98f9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -3,8 +3,8 @@ include ../config.mk CFLAGS += -fPIC -UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \ - inet_proto.o namespace.o json_writer.o json_print.o \ +UTILOBJ = utils.o utils_math.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \ + inet_proto.o namespace.o json_writer.o json_print.o json_print_math.o \ names.o color.o bpf_legacy.o bpf_glue.o exec.o fs.o cg_map.o ifeq ($(HAVE_ELF),y) diff --git a/lib/bpf_glue.c b/lib/bpf_glue.c index fa609bfe..d00a0dc1 100644 --- a/lib/bpf_glue.c +++ b/lib/bpf_glue.c @@ -14,7 +14,8 @@ int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns, size_t size_log) { #ifdef HAVE_LIBBPF - return bpf_load_program(type, insns, size_insns, license, 0, log, size_log); + return bpf_load_program(type, insns, size_insns / sizeof(struct bpf_insn), + license, 0, log, size_log); #else return bpf_prog_load_dev(type, insns, size_insns, license, 0, log, size_log); #endif diff --git a/lib/bpf_legacy.c b/lib/bpf_legacy.c index bc869c3f..8a03b9c2 100644 --- a/lib/bpf_legacy.c +++ b/lib/bpf_legacy.c @@ -510,20 +510,14 @@ static int bpf_mnt_fs(const char *target) static int bpf_mnt_check_target(const char *target) { - struct stat sb = {}; int ret; - ret = stat(target, &sb); - if (ret) { - ret = mkdir(target, S_IRWXU); - if (ret) { - fprintf(stderr, "mkdir %s failed: %s\n", target, - strerror(errno)); - return ret; - } - } + ret = mkdir(target, S_IRWXU); + if (ret && errno != EEXIST) + fprintf(stderr, "mkdir %s failed: %s\n", target, + strerror(errno)); - return 0; + return ret; } static int bpf_valid_mntpt(const char *mnt, unsigned long magic) diff --git a/lib/fs.c b/lib/fs.c index 4b90a704..ee0b130b 100644 --- a/lib/fs.c +++ b/lib/fs.c @@ -157,7 +157,8 @@ __u64 get_cgroup2_id(const char *path) memcpy(cg_id.bytes, fhp->f_handle, sizeof(__u64)); out: - close(mnt_fd); + if (mnt_fd >= 0) + close(mnt_fd); free(mnt); return cg_id.id; @@ -179,16 +180,16 @@ char *get_cgroup2_path(__u64 id, bool full) char *path = NULL; char fd_path[64]; int link_len; - char *mnt; + char *mnt = NULL; if (!id) { fprintf(stderr, "Invalid cgroup2 ID\n"); - return NULL; + goto out; } mnt = find_cgroup2_mount(false); if (!mnt) - return NULL; + goto out; mnt_fd = open(mnt, O_RDONLY); if (mnt_fd < 0) { @@ -225,8 +226,10 @@ char *get_cgroup2_path(__u64 id, bool full) "Failed to allocate memory for cgroup2 path\n"); out: - close(fd); - close(mnt_fd); + if (fd >= 0) + close(fd); + if (mnt_fd >= 0) + close(mnt_fd); free(mnt); return path; @@ -253,7 +256,7 @@ int make_path(const char *path, mode_t mode) *delim = '\0'; rc = mkdir(dir, mode); - if (mkdir(dir, mode) != 0 && errno != EEXIST) { + if (rc && errno != EEXIST) { fprintf(stderr, "mkdir failed for %s: %s\n", dir, strerror(errno)); goto out; diff --git a/lib/json_print.c b/lib/json_print.c index b086123a..994a2f8d 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -11,16 +11,12 @@ #include #include -#include #include "utils.h" #include "json_print.h" static json_writer_t *_jw; -#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw) -#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY)) - static void __new_json_obj(int json, bool have_array) { if (json) { @@ -342,32 +338,3 @@ int print_color_rate(bool use_iec, enum output_type type, enum color_attr color, free(buf); return rc; } - -char *sprint_size(__u32 sz, char *buf) -{ - long kilo = 1024; - long mega = kilo * kilo; - size_t len = SPRINT_BSIZE - 1; - double tmp = sz; - - if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024) - snprintf(buf, len, "%gMb", rint(tmp / mega)); - else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16) - snprintf(buf, len, "%gKb", rint(tmp / kilo)); - else - snprintf(buf, len, "%ub", sz); - - return buf; -} - -int print_color_size(enum output_type type, enum color_attr color, - const char *key, const char *fmt, __u32 sz) -{ - SPRINT_BUF(buf); - - if (_IS_JSON_CONTEXT(type)) - return print_color_uint(type, color, key, "%u", sz); - - sprint_size(sz, buf); - return print_color_string(type, color, key, fmt, buf); -} diff --git a/lib/json_print_math.c b/lib/json_print_math.c new file mode 100644 index 00000000..f4d50499 --- /dev/null +++ b/lib/json_print_math.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include + +#include "utils.h" +#include "json_print.h" + +char *sprint_size(__u32 sz, char *buf) +{ + long kilo = 1024; + long mega = kilo * kilo; + size_t len = SPRINT_BSIZE - 1; + double tmp = sz; + + if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024) + snprintf(buf, len, "%gMb", rint(tmp / mega)); + else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16) + snprintf(buf, len, "%gKb", rint(tmp / kilo)); + else + snprintf(buf, len, "%ub", sz); + + return buf; +} + +int print_color_size(enum output_type type, enum color_attr color, + const char *key, const char *fmt, __u32 sz) +{ + SPRINT_BUF(buf); + + if (_IS_JSON_CONTEXT(type)) + return print_color_uint(type, color, key, "%u", sz); + + sprint_size(sz, buf); + return print_color_string(type, color, key, fmt, buf); +} diff --git a/lib/namespace.c b/lib/namespace.c index 06ae0a48..45a7dedd 100644 --- a/lib/namespace.c +++ b/lib/namespace.c @@ -122,8 +122,14 @@ int netns_foreach(int (*func)(char *nsname, void *arg), void *arg) struct dirent *entry; dir = opendir(NETNS_RUN_DIR); - if (!dir) + if (!dir) { + if (errno == ENOENT) + return 0; + + fprintf(stderr, "Failed to open directory %s: %s\n", + NETNS_RUN_DIR, strerror(errno)); return -1; + } while ((entry = readdir(dir)) != NULL) { if (strcmp(entry->d_name, ".") == 0) diff --git a/lib/utils.c b/lib/utils.c index 633f6359..93ae0c55 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -513,120 +513,6 @@ int get_addr64(__u64 *ap, const char *cp) return 1; } -/* See http://physics.nist.gov/cuu/Units/binary.html */ -static const struct rate_suffix { - const char *name; - double scale; -} suffixes[] = { - { "bit", 1. }, - { "Kibit", 1024. }, - { "kbit", 1000. }, - { "mibit", 1024.*1024. }, - { "mbit", 1000000. }, - { "gibit", 1024.*1024.*1024. }, - { "gbit", 1000000000. }, - { "tibit", 1024.*1024.*1024.*1024. }, - { "tbit", 1000000000000. }, - { "Bps", 8. }, - { "KiBps", 8.*1024. }, - { "KBps", 8000. }, - { "MiBps", 8.*1024*1024. }, - { "MBps", 8000000. }, - { "GiBps", 8.*1024.*1024.*1024. }, - { "GBps", 8000000000. }, - { "TiBps", 8.*1024.*1024.*1024.*1024. }, - { "TBps", 8000000000000. }, - { NULL } -}; - -int get_rate(unsigned int *rate, const char *str) -{ - char *p; - double bps = strtod(str, &p); - const struct rate_suffix *s; - - if (p == str) - return -1; - - for (s = suffixes; s->name; ++s) { - if (strcasecmp(s->name, p) == 0) { - bps *= s->scale; - p += strlen(p); - break; - } - } - - if (*p) - return -1; /* unknown suffix */ - - bps /= 8; /* -> bytes per second */ - *rate = bps; - /* detect if an overflow happened */ - if (*rate != floor(bps)) - return -1; - return 0; -} - -int get_rate64(__u64 *rate, const char *str) -{ - char *p; - double bps = strtod(str, &p); - const struct rate_suffix *s; - - if (p == str) - return -1; - - for (s = suffixes; s->name; ++s) { - if (strcasecmp(s->name, p) == 0) { - bps *= s->scale; - p += strlen(p); - break; - } - } - - if (*p) - return -1; /* unknown suffix */ - - bps /= 8; /* -> bytes per second */ - *rate = bps; - return 0; -} - -int get_size(unsigned int *size, const char *str) -{ - double sz; - char *p; - - sz = strtod(str, &p); - if (p == str) - return -1; - - if (*p) { - if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0) - sz *= 1024; - else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0) - sz *= 1024*1024*1024; - else if (strcasecmp(p, "gbit") == 0) - sz *= 1024*1024*1024/8; - else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0) - sz *= 1024*1024; - else if (strcasecmp(p, "mbit") == 0) - sz *= 1024*1024/8; - else if (strcasecmp(p, "kbit") == 0) - sz *= 1024/8; - else if (strcasecmp(p, "b") != 0) - return -1; - } - - *size = sz; - - /* detect if an overflow happened */ - if (*size != floor(sz)) - return -1; - - return 0; -} - static void set_address_type(inet_prefix *addr) { switch (addr->family) { diff --git a/lib/utils_math.c b/lib/utils_math.c new file mode 100644 index 00000000..9ef3dd6e --- /dev/null +++ b/lib/utils_math.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "utils.h" + +/* See http://physics.nist.gov/cuu/Units/binary.html */ +static const struct rate_suffix { + const char *name; + double scale; +} suffixes[] = { + { "bit", 1. }, + { "Kibit", 1024. }, + { "kbit", 1000. }, + { "mibit", 1024.*1024. }, + { "mbit", 1000000. }, + { "gibit", 1024.*1024.*1024. }, + { "gbit", 1000000000. }, + { "tibit", 1024.*1024.*1024.*1024. }, + { "tbit", 1000000000000. }, + { "Bps", 8. }, + { "KiBps", 8.*1024. }, + { "KBps", 8000. }, + { "MiBps", 8.*1024*1024. }, + { "MBps", 8000000. }, + { "GiBps", 8.*1024.*1024.*1024. }, + { "GBps", 8000000000. }, + { "TiBps", 8.*1024.*1024.*1024.*1024. }, + { "TBps", 8000000000000. }, + { NULL } +}; + +int get_rate(unsigned int *rate, const char *str) +{ + char *p; + double bps = strtod(str, &p); + const struct rate_suffix *s; + + if (p == str) + return -1; + + for (s = suffixes; s->name; ++s) { + if (strcasecmp(s->name, p) == 0) { + bps *= s->scale; + p += strlen(p); + break; + } + } + + if (*p) + return -1; /* unknown suffix */ + + bps /= 8; /* -> bytes per second */ + *rate = bps; + /* detect if an overflow happened */ + if (*rate != floor(bps)) + return -1; + return 0; +} + +int get_rate64(__u64 *rate, const char *str) +{ + char *p; + double bps = strtod(str, &p); + const struct rate_suffix *s; + + if (p == str) + return -1; + + for (s = suffixes; s->name; ++s) { + if (strcasecmp(s->name, p) == 0) { + bps *= s->scale; + p += strlen(p); + break; + } + } + + if (*p) + return -1; /* unknown suffix */ + + bps /= 8; /* -> bytes per second */ + *rate = bps; + return 0; +} + +int get_size(unsigned int *size, const char *str) +{ + double sz; + char *p; + + sz = strtod(str, &p); + if (p == str) + return -1; + + if (*p) { + if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k") == 0) + sz *= 1024; + else if (strcasecmp(p, "gb") == 0 || strcasecmp(p, "g") == 0) + sz *= 1024*1024*1024; + else if (strcasecmp(p, "gbit") == 0) + sz *= 1024*1024*1024/8; + else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m") == 0) + sz *= 1024*1024; + else if (strcasecmp(p, "mbit") == 0) + sz *= 1024*1024/8; + else if (strcasecmp(p, "kbit") == 0) + sz *= 1024/8; + else if (strcasecmp(p, "b") != 0) + return -1; + } + + *size = sz; + + /* detect if an overflow happened */ + if (*size != floor(sz)) + return -1; + + return 0; +} diff --git a/man/man8/bridge.8 b/man/man8/bridge.8 index b3414ae3..9d8663bd 100644 --- a/man/man8/bridge.8 +++ b/man/man8/bridge.8 @@ -397,7 +397,8 @@ bridge FDB. .TP .BR "flood on " or " flood off " -Controls whether a given port will flood unicast traffic for which there is no FDB entry. By default this flag is on. +Controls whether unicast traffic for which there is no FDB entry will be +flooded towards this given port. By default this flag is on. .TP .B hwmode @@ -413,8 +414,8 @@ switch. .TP .BR "mcast_flood on " or " mcast_flood off " -Controls whether a given port will flood multicast traffic for which -there is no MDB entry. By default this flag is on. +Controls whether multicast traffic for which there is no MDB entry will be +flooded towards this given port. By default this flag is on. .TP .BR "mcast_to_unicast on " or " mcast_to_unicast off " @@ -514,7 +515,14 @@ the Ethernet MAC address. the interface to which this address is associated. .B local -- is a local permanent fdb entry +- is a local permanent fdb entry, which means that the bridge will not forward +frames with this destination MAC address and VLAN ID, but terminate them +locally. This flag is default unless "static" or "dynamic" are explicitly +specified. +.sp + +.B permanent +- this is a synonym for "local" .sp .B static @@ -526,11 +534,21 @@ the interface to which this address is associated. .sp .B self -- the address is associated with the port drivers fdb. Usually hardware. +- the operation is fulfilled directly by the driver for the specified network +device. If the network device belongs to a master like a bridge, then the +bridge is bypassed and not notified of this operation (and if the device does +notify the bridge, it is driver-specific behavior and not mandated by this +flag, check the driver for more details). The "bridge fdb add" command can also +be used on the bridge device itself, and in this case, the added fdb entries +will be locally terminated (not forwarded). In the latter case, the "self" flag +is mandatory. The flag is set by default if "master" is not specified. .sp .B master -- the address is associated with master devices fdb. Usually software (default). +- if the specified network device is a port that belongs to a master device +such as a bridge, the operation is fulfilled by the master device's driver, +which may in turn notify the port driver too of the address. If the specified +device is a master itself, such as a bridge, this flag is invalid. .sp .B router diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 3516765a..fd67e611 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -1307,9 +1307,9 @@ For a link of type the following additional arguments are supported: .BI "ip link add " DEVICE -.BI type " bareudp " dstport " PORT " ethertype " ETHERTYPE" +.BI type " bareudp " dstport " PORT " ethertype " PROTO" [ -.BI srcportmin " SRCPORTMIN " +.BI srcportmin " PORT " ] [ .RB [ no ] multiproto ] @@ -1320,11 +1320,14 @@ the following additional arguments are supported: - specifies the destination port for the UDP tunnel. .sp -.BI ethertype " ETHERTYPE" +.BI ethertype " PROTO" - specifies the ethertype of the L3 protocol being tunnelled. +.B ethertype +can be given as plain Ethernet protocol number or using the protocol name +("ipv4", "ipv6", "mpls_uc", etc.). .sp -.BI srcportmin " SRCPORTMIN" +.BI srcportmin " PORT" - selects the lowest value of the UDP tunnel source port range. .sp @@ -1332,11 +1335,11 @@ the following additional arguments are supported: - activates support for protocols similar to the one .RB "specified by " ethertype . When -.I ETHERTYPE +.B ethertype is "mpls_uc" (that is, unicast MPLS), this allows the tunnel to also handle multicast MPLS. When -.I ETHERTYPE +.B ethertype is "ipv4", this allows the tunnel to also handle IPv6. This option is disabled by default. diff --git a/man/man8/ss.8 b/man/man8/ss.8 index e4b9cdcb..42aac6de 100644 --- a/man/man8/ss.8 +++ b/man/man8/ss.8 @@ -440,6 +440,113 @@ states except for - opposite to .B bucket +.SH EXPRESSION + +.B EXPRESSION +allows filtering based on specific criteria. +.B EXPRESSION +consists of a series of predicates combined by boolean operators. The possible operators in increasing +order of precedence are +.B or +(or | or ||), +.B and +(or & or &&), and +.B not +(or !). If no operator is between consecutive predicates, an implicit +.B and +operator is assumed. Subexpressions can be grouped with "(" and ")". +.P +The following predicates are supported: + +.TP +.B {dst|src} [=] HOST +Test if the destination or source matches HOST. See HOST SYNTAX for details. +.TP +.B {dport|sport} [OP] [FAMILY:]:PORT +Compare the destination or source port to PORT. OP can be any of "<", "<=", "=", "!=", +">=" and ">". Following normal arithmetic rules. FAMILY and PORT are as described in +HOST SYNTAX below. +.TP +.B dev [=|!=] DEVICE +Match based on the device the connection uses. DEVICE can either be a device name or the +index of the interface. +.TP +.B fwmark [=|!=] MASK +Matches based on the fwmark value for the connection. This can either be a specific mark value +or a mark value followed by a "/" and a bitmask of which bits to use in the comparison. For example +"fwmark = 0x01/0x03" would match if the two least significant bits of the fwmark were 0x01. +.TP +.B cgroup [=|!=] PATH +Match if the connection is part of a cgroup at the given path. +.TP +.B autobound +Match if the port or path of the source address was automatically allocated +(rather than explicitly specified). +.P +Most operators have aliases. If no operator is supplied "=" is assumed. +Each of the following groups of operators are all equivalent: +.RS +.IP \(bu 2 += == eq +.IP \(bu +!= ne neq +.IP \(bu +> gt +.IP \(bu +< lt +.IP \(bu +>= ge geq +.IP \(bu +<= le leq +.IP \(bu +! not +.IP \(bu +| || or +.IP \(bu +& && and +.RE +.SH HOST SYNTAX +.P +The general host syntax is [FAMILY:]ADDRESS[:PORT]. +.P +FAMILY must be one of the families supported by the -f option. If not given +it defaults to the family given with the -f option, and if that is also +missing, will assume either inet or inet6. Note that all host conditions in the +expression should either all be the same family or be only inet and inet6. If there +is some other mixture of families, the results will probably be unexpected. +.P +The form of ADDRESS and PORT depends on the family used. "*" can be used as +a wildcard for either the address or port. The details for each family are as +follows: +.TP +.B unix +ADDRESS is a glob pattern (see +.BR fnmatch (3)) +that will be matched case-insensitively against the unix socket's address. Both path and abstract +names are supported. Unix addresses do not support a port, and "*" cannot be used as a wildcard. +.TP +.B link +ADDRESS is the case-insensitive name of an Ethernet protocol to match. PORT +is either a device name or a device index for the desired link device, as seen +in the output of ip link. +.TP +.B netlink +ADDRESS is a descriptor of the netlink family. Possible values come from +/etc/iproute2/nl_protos. PORT is the port id of the socket, which is usually +the same as the owning process id. The value "kernel" can be used to represent +the kernel (port id of 0). +.TP +.B vsock +ADDRESS is an integer representing the CID address, and PORT is the port. +.TP +.BR inet \ and\ inet6 +ADDRESS is an ip address (either v4 or v6 depending on the family) or a DNS +hostname that resolves to an ip address of the required version. An ipv6 +address must be enclosed in "[" and "]" to disambiguate the port separator. The +address may additionally have a prefix length given in CIDR notation (a slash +followed by the prefix length in bits). PORT is either the numerical +socket port, or the service name for the port to match. + .SH USAGE EXAMPLES .TP .B ss -t -a diff --git a/man/man8/tc-taprio.8 b/man/man8/tc-taprio.8 index e1d19ba1..d13c86f7 100644 --- a/man/man8/tc-taprio.8 +++ b/man/man8/tc-taprio.8 @@ -92,7 +92,11 @@ in the schedule; clockid .br Specifies the clock to be used by qdisc's internal timer for measuring -time and scheduling events. +time and scheduling events. This argument must be omitted when using the +full-offload feature (flags 0x2), since in that case, the clockid is +implicitly /dev/ptpN (where N is given by +.B ethtool -T eth0 | grep 'PTP Hardware Clock' +), and therefore not necessarily synchronized with the system's CLOCK_TAI. .TP sched-entry @@ -115,13 +119,27 @@ before moving to the next entry. .TP flags .br -Specifies different modes for taprio. Currently, only txtime-assist is -supported which can be enabled by setting it to 0x1. In this mode, taprio will -set the transmit timestamp depending on the interval in which the packet needs -to be transmitted. It will then utililize the +This is a bit mask which specifies different modes for taprio. +.RS +.TP +.I 0x1 +Enables the txtime-assist feature. In this mode, taprio will set the transmit +timestamp depending on the interval in which the packet needs to be +transmitted. It will then utililize the .BR etf(8) qdisc to sort and transmit the packets at the right time. The second example can be used as a reference to configure this mode. +.TP +.I 0x2 +Enables the full-offload feature. In this mode, taprio will pass the gate +control list to the NIC which will execute it cyclically in hardware. +When using full-offload, there is no need to specify the +.B clockid +argument. + +The txtime-assist and full-offload features are mutually exclusive, i.e. +setting flags to 0x3 is invalid. +.RE .TP txtime-delay @@ -178,5 +196,28 @@ for more information about configuring the ETF qdisc. offload delta 200000 clockid CLOCK_TAI .EE +The following is a schedule in full offload mode. The +.B base-time +is 200 ns and the +.B cycle-time +is implicitly calculated as the sum of all +.B sched-entry +durations (i.e. 20 us + 20 us + 60 us = 100 us). Although the base-time is in +the past, the hardware will start executing the schedule at a PTP time equal to +the smallest integer multiple of 100 us, plus 200 ns, that is larger than the +NIC's current PTP time. + +.EX +# tc qdisc add dev eth0 parent root taprio \\ + num_tc 8 \\ + map 0 1 2 3 4 5 6 7 \\ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \\ + base-time 200 \\ + sched-entry S 80 20000 \\ + sched-entry S a0 20000 \\ + sched-entry S df 60000 \\ + flags 0x2 +.EE + .SH AUTHORS Vinicius Costa Gomes diff --git a/misc/ss.c b/misc/ss.c index 5c934fa0..894ad405 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -3404,7 +3404,7 @@ static int tcpdiag_send(int fd, int protocol, struct filter *f) struct iovec iov[3]; int iovlen = 1; - if (protocol == IPPROTO_UDP) + if (protocol == IPPROTO_UDP || protocol == IPPROTO_MPTCP) return -1; if (protocol == IPPROTO_TCP) @@ -3623,6 +3623,14 @@ static int inet_show_netlink(struct filter *f, FILE *dump_fp, int protocol) if (preferred_family == PF_INET6) family = PF_INET6; + /* extended protocol will use INET_DIAG_REQ_PROTOCOL, + * not supported by older kernels. On such kernel + * rtnl_dump will bail with rtnl_dump_error(). + * Suppress the error to avoid confusing the user + */ + if (protocol > 255) + rth.flags |= RTNL_HANDLE_F_SUPPRESS_NLERR; + again: if ((err = sockdiag_send(family, rth.fd, protocol, f))) goto Exit; diff --git a/rdma/rdma.h b/rdma/rdma.h index 470e11c8..8b421db8 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -85,6 +85,7 @@ struct rd_cmd { * Parser interface */ bool rd_no_arg(struct rd *rd); +bool rd_is_multiarg(struct rd *rd); void rd_arg_inc(struct rd *rd); char *rd_argv(struct rd *rd); diff --git a/rdma/stat.c b/rdma/stat.c index a2b5da1c..75d45288 100644 --- a/rdma/stat.c +++ b/rdma/stat.c @@ -502,6 +502,12 @@ static int stat_get_arg(struct rd *rd, const char *arg) return -EINVAL; rd_arg_inc(rd); + + if (rd_is_multiarg(rd)) { + pr_err("The parameter %s shouldn't include range\n", arg); + return -EINVAL; + } + value = strtol(rd_argv(rd), &endp, 10); rd_arg_inc(rd); @@ -523,6 +529,8 @@ static int stat_one_qp_bind(struct rd *rd) return ret; lqpn = stat_get_arg(rd, "lqpn"); + if (lqpn < 0) + return lqpn; rd_prepare_msg(rd, RDMA_NLDEV_CMD_STAT_SET, &seq, (NLM_F_REQUEST | NLM_F_ACK)); @@ -537,6 +545,9 @@ static int stat_one_qp_bind(struct rd *rd) if (rd_argc(rd)) { cntn = stat_get_arg(rd, "cntn"); + if (cntn < 0) + return cntn; + mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn); } @@ -607,13 +618,23 @@ static int stat_one_qp_unbind(struct rd *rd) unsigned int portid; uint32_t seq; + if (rd_no_arg(rd)) { + stat_help(rd); + return -EINVAL; + } + ret = rd_build_filter(rd, stat_valid_filters); if (ret) return ret; cntn = stat_get_arg(rd, "cntn"); + if (cntn < 0) + return cntn; + if (rd_argc(rd)) { lqpn = stat_get_arg(rd, "lqpn"); + if (lqpn < 0) + return lqpn; return do_stat_qp_unbind_lqpn(rd, cntn, lqpn); } diff --git a/rdma/utils.c b/rdma/utils.c index 903a544c..292e1808 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -47,6 +47,13 @@ bool rd_no_arg(struct rd *rd) return rd_argc(rd) == 0; } +bool rd_is_multiarg(struct rd *rd) +{ + if (!rd_argc(rd)) + return false; + return strpbrk(rd_argv(rd), ",-") != NULL; +} + /* * Possible input:output * dev/port | first port | is_dump_all diff --git a/tc/m_gate.c b/tc/m_gate.c index 892775a3..c091ae19 100644 --- a/tc/m_gate.c +++ b/tc/m_gate.c @@ -427,7 +427,7 @@ static int print_gate_list(struct rtattr *list) __u32 index = 0, interval = 0; __u8 gate_state = 0; __s32 ipv = -1, maxoctets = -1; - char buf[22]; + SPRINT_BUF(buf); parse_rtattr_nested(tb, TCA_GATE_ENTRY_MAX, item); @@ -490,7 +490,7 @@ static int print_gate(struct action_util *au, FILE *f, struct rtattr *arg) __s64 base_time = 0; __s64 cycle_time = 0; __s64 cycle_time_ext = 0; - char buf[22]; + SPRINT_BUF(buf); int prio = -1; if (arg == NULL) diff --git a/vdpa/.gitignore b/vdpa/.gitignore new file mode 100644 index 00000000..7ef28784 --- /dev/null +++ b/vdpa/.gitignore @@ -0,0 +1 @@ +vdpa