From 152594273608612604fe773a762bce2327427e32 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Apr 2018 07:29:14 +0300 Subject: [PATCH 01/28] rdma: Print net device name and index for RDMA device The RDMA devices are operated in RoCE and iWARP modes have net device underneath. Present their names in regular output and their net index in detailed mode. [root@nps ~]# rdma link show mlx5_3/1 4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7 [root@nps ~]# rdma link show mlx5_3/1 -d 4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7 netdev_index 7 caps: Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: David Ahern --- rdma/include/uapi/rdma/rdma_netlink.h | 4 ++++ rdma/link.c | 21 +++++++++++++++++++++ rdma/utils.c | 2 ++ 3 files changed, 27 insertions(+) diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h index 9446a721..45474f13 100644 --- a/rdma/include/uapi/rdma/rdma_netlink.h +++ b/rdma/include/uapi/rdma/rdma_netlink.h @@ -388,6 +388,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ + /* Netdev information for relevant protocols, like RoCE and iWARP */ + RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */ + RDMA_NLDEV_ATTR_NDEV_NAME, /* string */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _RDMA_NETLINK_H */ diff --git a/rdma/link.c b/rdma/link.c index 66bcd50e..7e914c87 100644 --- a/rdma/link.c +++ b/rdma/link.c @@ -205,6 +205,26 @@ static void link_print_phys_state(struct rd *rd, struct nlattr **tb) pr_out("physical_state %s ", phys_state_to_str(phys_state)); } +static void link_print_netdev(struct rd *rd, struct nlattr **tb) +{ + const char *netdev_name; + uint32_t idx; + + if (!tb[RDMA_NLDEV_ATTR_NDEV_NAME] || !tb[RDMA_NLDEV_ATTR_NDEV_INDEX]) + return; + + netdev_name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_NDEV_NAME]); + idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_NDEV_INDEX]); + if (rd->json_output) { + 
jsonw_string_field(rd->jw, "netdev", netdev_name); + jsonw_uint_field(rd->jw, "netdev_index", idx); + } else { + pr_out("netdev %s ", netdev_name); + if (rd->show_details) + pr_out("netdev_index %u ", idx); + } +} + static int link_parse_cb(const struct nlmsghdr *nlh, void *data) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {}; @@ -241,6 +261,7 @@ static int link_parse_cb(const struct nlmsghdr *nlh, void *data) link_print_lmc(rd, tb); link_print_state(rd, tb); link_print_phys_state(rd, tb); + link_print_netdev(rd, tb); if (rd->show_details) link_print_caps(rd, tb); diff --git a/rdma/utils.c b/rdma/utils.c index 5c1e736a..49c967f3 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -391,6 +391,8 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_LKEY] = MNL_TYPE_U32, [RDMA_NLDEV_ATTR_RES_IOVA] = MNL_TYPE_U64, [RDMA_NLDEV_ATTR_RES_MRLEN] = MNL_TYPE_U64, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = MNL_TYPE_U32, + [RDMA_NLDEV_ATTR_NDEV_NAME] = MNL_TYPE_NUL_STRING, }; int rd_attr_cb(const struct nlattr *attr, void *data) From 1d3c91a7c49d2795c3e57a7bb114d81ba10e0eeb Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Tue, 3 Apr 2018 09:09:55 -0400 Subject: [PATCH 02/28] tc: jsonify connmark action Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/m_connmark.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tc/m_connmark.c b/tc/m_connmark.c index bcce4139..45e2d05f 100644 --- a/tc/m_connmark.c +++ b/tc/m_connmark.c @@ -114,16 +114,20 @@ static int print_connmark(struct action_util *au, FILE *f, struct rtattr *arg) parse_rtattr_nested(tb, TCA_CONNMARK_MAX, arg); if (tb[TCA_CONNMARK_PARMS] == NULL) { - fprintf(f, "[NULL connmark parameters]"); + print_string(PRINT_FP, NULL, "%s", "[NULL connmark parameters]"); return -1; } ci = RTA_DATA(tb[TCA_CONNMARK_PARMS]); - fprintf(f, " connmark zone %d", ci->zone); - print_action_control(f, " ", ci->action, "\n"); - fprintf(f, "\t index %u ref %d bind %d", 
ci->index, - ci->refcnt, ci->bindcnt); + print_string(PRINT_ANY, "kind", "%s ", "connmark"); + print_uint(PRINT_ANY, "zone", "zone %u", ci->zone); + print_action_control(f, " ", ci->action, ""); + + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "index", "\t index %u", ci->index); + print_int(PRINT_ANY, "ref", " ref %d", ci->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", ci->bindcnt); if (show_stats) { if (tb[TCA_CONNMARK_TM]) { @@ -132,7 +136,7 @@ static int print_connmark(struct action_util *au, FILE *f, struct rtattr *arg) print_tm(f, tm); } } - fprintf(f, "\n"); + print_string(PRINT_FP, NULL, "%s", _SL_); return 0; } From 8feb516bfcdd9e8417ffce89733220fd0e03e6a2 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 4 Apr 2018 13:21:18 -0400 Subject: [PATCH 03/28] tc: jsonify tunnel_key action Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/m_tunnel_key.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c index bac3c07f..0fa46154 100644 --- a/tc/m_tunnel_key.c +++ b/tc/m_tunnel_key.c @@ -221,7 +221,13 @@ static void tunnel_key_print_ip_addr(FILE *f, const char *name, else return; - fprintf(f, "\n\t%s %s", name, rt_addr_n2a_rta(family, attr)); + print_string(PRINT_FP, NULL, "%s", _SL_); + if (matches(name, "src_ip") == 0) + print_string(PRINT_ANY, "src_ip", "\tsrc_ip %s", + rt_addr_n2a_rta(family, attr)); + else if (matches(name, "dst_ip") == 0) + print_string(PRINT_ANY, "dst_ip", "\tdst_ip %s", + rt_addr_n2a_rta(family, attr)); } static void tunnel_key_print_key_id(FILE *f, const char *name, @@ -229,7 +235,8 @@ static void tunnel_key_print_key_id(FILE *f, const char *name, { if (!attr) return; - fprintf(f, "\n\t%s %d", name, rta_getattr_be32(attr)); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "key_id", "\tkey_id %u", rta_getattr_be32(attr)); } static void tunnel_key_print_dst_port(FILE *f, char *name, @@ 
-237,7 +244,9 @@ static void tunnel_key_print_dst_port(FILE *f, char *name, { if (!attr) return; - fprintf(f, "\n\t%s %d", name, rta_getattr_be16(attr)); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "dst_port", "\tdst_port %u", + rta_getattr_be16(attr)); } static void tunnel_key_print_flag(FILE *f, const char *name_on, @@ -246,7 +255,9 @@ static void tunnel_key_print_flag(FILE *f, const char *name_on, { if (!attr) return; - fprintf(f, "\n\t%s", rta_getattr_u8(attr) ? name_on : name_off); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_string(PRINT_ANY, "flag", "\t%s", + rta_getattr_u8(attr) ? name_on : name_off); } static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg) @@ -260,19 +271,20 @@ static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg) parse_rtattr_nested(tb, TCA_TUNNEL_KEY_MAX, arg); if (!tb[TCA_TUNNEL_KEY_PARMS]) { - fprintf(f, "[NULL tunnel_key parameters]"); + print_string(PRINT_FP, NULL, "%s", + "[NULL tunnel_key parameters]"); return -1; } parm = RTA_DATA(tb[TCA_TUNNEL_KEY_PARMS]); - fprintf(f, "tunnel_key"); + print_string(PRINT_ANY, "kind", "%s ", "tunnel_key"); switch (parm->t_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: - fprintf(f, " unset"); + print_string(PRINT_ANY, "mode", " %s", "unset"); break; case TCA_TUNNEL_KEY_ACT_SET: - fprintf(f, " set"); + print_string(PRINT_ANY, "mode", " %s", "set"); tunnel_key_print_ip_addr(f, "src_ip", tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); tunnel_key_print_ip_addr(f, "dst_ip", @@ -291,8 +303,10 @@ static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg) } print_action_control(f, " ", parm->action, ""); - fprintf(f, "\n\tindex %d ref %d bind %d", parm->index, parm->refcnt, - parm->bindcnt); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "index", "\t index %u", parm->index); + print_int(PRINT_ANY, "ref", " ref %d", parm->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", parm->bindcnt); if 
(show_stats) { if (tb[TCA_TUNNEL_KEY_TM]) { @@ -302,7 +316,7 @@ static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg) } } - fprintf(f, "\n "); + print_string(PRINT_FP, NULL, "%s", _SL_); return 0; } From 7b17701717ee9f749f3325e7f830e8e5b84a507a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Tue, 10 Apr 2018 14:04:29 -0400 Subject: [PATCH 04/28] tc: jsonify skbedit action v2: Fixed strings format in print_string() Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/m_skbedit.c | 53 +++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/tc/m_skbedit.c b/tc/m_skbedit.c index db5c64ca..7391fc7f 100644 --- a/tc/m_skbedit.c +++ b/tc/m_skbedit.c @@ -168,9 +168,8 @@ static int print_skbedit(struct action_util *au, FILE *f, struct rtattr *arg) struct rtattr *tb[TCA_SKBEDIT_MAX + 1]; SPRINT_BUF(b1); - __u32 *priority; - __u32 *mark; - __u16 *queue_mapping, *ptype; + __u32 priority; + __u16 ptype; struct tc_skbedit *p = NULL; if (arg == NULL) @@ -179,43 +178,49 @@ static int print_skbedit(struct action_util *au, FILE *f, struct rtattr *arg) parse_rtattr_nested(tb, TCA_SKBEDIT_MAX, arg); if (tb[TCA_SKBEDIT_PARMS] == NULL) { - fprintf(f, "[NULL skbedit parameters]"); + print_string(PRINT_FP, NULL, "%s", "[NULL skbedit parameters]"); return -1; } p = RTA_DATA(tb[TCA_SKBEDIT_PARMS]); - fprintf(f, " skbedit"); + print_string(PRINT_ANY, "kind", "%s ", "skbedit"); if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { - queue_mapping = RTA_DATA(tb[TCA_SKBEDIT_QUEUE_MAPPING]); - fprintf(f, " queue_mapping %u", *queue_mapping); + print_uint(PRINT_ANY, "queue_mapping", "queue_mapping %u", + rta_getattr_u16(tb[TCA_SKBEDIT_QUEUE_MAPPING])); } if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { - priority = RTA_DATA(tb[TCA_SKBEDIT_PRIORITY]); - fprintf(f, " priority %s", sprint_tc_classid(*priority, b1)); + priority = rta_getattr_u32(tb[TCA_SKBEDIT_PRIORITY]); + print_string(PRINT_ANY, "priority", " priority %s", 
+ sprint_tc_classid(priority, b1)); } if (tb[TCA_SKBEDIT_MARK] != NULL) { - mark = RTA_DATA(tb[TCA_SKBEDIT_MARK]); - fprintf(f, " mark %d", *mark); + print_uint(PRINT_ANY, "mark", " mark %u", + rta_getattr_u32(tb[TCA_SKBEDIT_MARK])); } if (tb[TCA_SKBEDIT_PTYPE] != NULL) { - ptype = RTA_DATA(tb[TCA_SKBEDIT_PTYPE]); - if (*ptype == PACKET_HOST) - fprintf(f, " ptype host"); - else if (*ptype == PACKET_BROADCAST) - fprintf(f, " ptype broadcast"); - else if (*ptype == PACKET_MULTICAST) - fprintf(f, " ptype multicast"); - else if (*ptype == PACKET_OTHERHOST) - fprintf(f, " ptype otherhost"); + ptype = rta_getattr_u16(tb[TCA_SKBEDIT_PTYPE]); + if (ptype == PACKET_HOST) + print_string(PRINT_ANY, "ptype", " ptype %s", "host"); + else if (ptype == PACKET_BROADCAST) + print_string(PRINT_ANY, "ptype", " ptype %s", + "broadcast"); + else if (ptype == PACKET_MULTICAST) + print_string(PRINT_ANY, "ptype", " ptype %s", + "multicast"); + else if (ptype == PACKET_OTHERHOST) + print_string(PRINT_ANY, "ptype", " ptype %s", + "otherhost"); else - fprintf(f, " ptype %d", *ptype); + print_uint(PRINT_ANY, "ptype", " ptype %u", ptype); } print_action_control(f, " ", p->action, ""); - fprintf(f, "\n\t index %u ref %d bind %d", - p->index, p->refcnt, p->bindcnt); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "index", "\t index %u", p->index); + print_int(PRINT_ANY, "ref", " ref %d", p->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", p->bindcnt); if (show_stats) { if (tb[TCA_SKBEDIT_TM]) { @@ -225,7 +230,7 @@ static int print_skbedit(struct action_util *au, FILE *f, struct rtattr *arg) } } - fprintf(f, "\n "); + print_string(PRINT_FP, NULL, "%s", _SL_); return 0; } From 8744c5d3388e31f7bfc8da979af02bf18e62f55f Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Fri, 13 Apr 2018 17:40:05 -0400 Subject: [PATCH 05/28] tc: jsonify ife action Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/m_ife.c | 54 ++++++++++++++++++++++++++++++++---------------------- 1 
file changed, 32 insertions(+), 22 deletions(-) diff --git a/tc/m_ife.c b/tc/m_ife.c index d7e61703..15d09a16 100644 --- a/tc/m_ife.c +++ b/tc/m_ife.c @@ -240,22 +240,24 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg) parse_rtattr_nested(tb, TCA_IFE_MAX, arg); if (tb[TCA_IFE_PARMS] == NULL) { - fprintf(f, "[NULL ife parameters]"); + print_string(PRINT_FP, NULL, "%s", "[NULL ife parameters]"); return -1; } p = RTA_DATA(tb[TCA_IFE_PARMS]); - fprintf(f, "ife %s ", p->flags & IFE_ENCODE ? "encode" : "decode"); + print_string(PRINT_ANY, "kind", "%s ", "ife"); + print_string(PRINT_ANY, "mode", "%s", + p->flags & IFE_ENCODE ? "encode" : "decode"); print_action_control(f, "action ", p->action, " "); if (tb[TCA_IFE_TYPE]) { ife_type = rta_getattr_u16(tb[TCA_IFE_TYPE]); has_optional = 1; - fprintf(f, "type 0x%X ", ife_type); + print_0xhex(PRINT_ANY, "type", "type 0x%X ", ife_type); } if (has_optional) - fprintf(f, "\n\t "); + print_string(PRINT_FP, NULL, "%s\t", _SL_); if (tb[TCA_IFE_METALST]) { struct rtattr *metalist[IFE_META_MAX + 1]; @@ -268,9 +270,11 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg) len = RTA_PAYLOAD(metalist[IFE_META_SKBMARK]); if (len) { mmark = rta_getattr_u32(metalist[IFE_META_SKBMARK]); - fprintf(f, "use mark %u ", mmark); + print_uint(PRINT_ANY, "mark", "use mark %u ", + mmark); } else - fprintf(f, "allow mark "); + print_string(PRINT_ANY, "mark", "%s mark ", + "allow"); } if (metalist[IFE_META_TCINDEX]) { @@ -278,41 +282,47 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg) if (len) { mtcindex = rta_getattr_u16(metalist[IFE_META_TCINDEX]); - fprintf(f, "use tcindex %d ", mtcindex); + print_uint(PRINT_ANY, "tcindex", + "use tcindex %u ", mtcindex); } else - fprintf(f, "allow tcindex "); + print_string(PRINT_ANY, "tcindex", + "%s tcindex ", "allow"); } if (metalist[IFE_META_PRIO]) { len = RTA_PAYLOAD(metalist[IFE_META_PRIO]); if (len) { mprio = 
rta_getattr_u32(metalist[IFE_META_PRIO]); - fprintf(f, "use prio %u ", mprio); + print_uint(PRINT_ANY, "prio", "use prio %u ", + mprio); } else - fprintf(f, "allow prio "); + print_string(PRINT_ANY, "prio", "%s prio ", + "allow"); } } if (tb[TCA_IFE_DMAC]) { has_optional = 1; - fprintf(f, "dst %s ", - ll_addr_n2a(RTA_DATA(tb[TCA_IFE_DMAC]), - RTA_PAYLOAD(tb[TCA_IFE_DMAC]), 0, b2, - sizeof(b2))); - + print_string(PRINT_ANY, "dst", "dst %s ", + ll_addr_n2a(RTA_DATA(tb[TCA_IFE_DMAC]), + RTA_PAYLOAD(tb[TCA_IFE_DMAC]), 0, b2, + sizeof(b2))); } if (tb[TCA_IFE_SMAC]) { has_optional = 1; - fprintf(f, "src %s ", - ll_addr_n2a(RTA_DATA(tb[TCA_IFE_SMAC]), - RTA_PAYLOAD(tb[TCA_IFE_SMAC]), 0, b2, - sizeof(b2))); + print_string(PRINT_ANY, "src", "src %s ", + ll_addr_n2a(RTA_DATA(tb[TCA_IFE_SMAC]), + RTA_PAYLOAD(tb[TCA_IFE_SMAC]), 0, b2, + sizeof(b2))); } - fprintf(f, "\n\t index %u ref %d bind %d", p->index, p->refcnt, - p->bindcnt); + print_string(PRINT_FP, NULL, "%s", _SL_); + print_uint(PRINT_ANY, "index", "\t index %u", p->index); + print_int(PRINT_ANY, "ref", " ref %d", p->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", p->bindcnt); + if (show_stats) { if (tb[TCA_IFE_TM]) { struct tcf_t *tm = RTA_DATA(tb[TCA_IFE_TM]); @@ -321,7 +331,7 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg) } } - fprintf(f, "\n"); + print_string(PRINT_FP, NULL, "%s", _SL_); return 0; } From 075bf62a705be8c26c1058066141d9a391b8deb3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Apr 2018 11:10:27 -0700 Subject: [PATCH 06/28] Update kernel headers Update kernel headers to commit 292eba02dbb4 ("net-next/hinic: add arm64 support") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 62 +++++++++++++++++++++++++++++++++++- include/uapi/linux/if_link.h | 1 + include/uapi/linux/sctp.h | 1 + include/uapi/linux/tcp.h | 5 +++ include/uapi/linux/tipc.h | 59 +++++++++++++++++++--------------- 5 files changed, 102 insertions(+), 26 deletions(-) diff --git 
a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 24bd85fc..02b96cba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -145,6 +148,12 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -294,6 +303,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -344,6 +358,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -729,6 +748,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. 
+ * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -794,7 +820,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -923,6 +950,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 @@ -998,6 +1034,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. 
* Some of this fields are in network (bigendian) byte order and may need @@ -1152,4 +1208,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 23cd82f3..9318ecd0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -514,6 +514,7 @@ enum { IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 30076fd1..2d95ddc1 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -127,6 +127,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_STREAM_SCHEDULER 123 #define SCTP_STREAM_SCHEDULER_VALUE 124 #define SCTP_INTERLEAVING_SUPPORTED 125 +#define SCTP_SENDMSG_CONNECT 126 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 5e1e6f3f..fc0e2650 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -224,6 +224,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -244,6 +247,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl. 
out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 1d92ccb0..88a7251b 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -45,33 +45,33 @@ * TIPC addressing primitives */ -struct tipc_portid { +struct tipc_socket_addr { __u32 ref; __u32 node; }; -struct tipc_name { +struct tipc_service_addr { __u32 type; __u32 instance; }; -struct tipc_name_seq { +struct tipc_service_range { __u32 type; __u32 lower; __u32 upper; }; /* - * Application-accessible port name types + * Application-accessible service types */ -#define TIPC_CFG_SRV 0 /* configuration service name type */ -#define TIPC_TOP_SRV 1 /* topology service name type */ -#define TIPC_LINK_STATE 2 /* link state name type */ -#define TIPC_RESERVED_TYPES 64 /* lowest user-publishable name type */ +#define TIPC_NODE_STATE 0 /* node state service type */ +#define TIPC_TOP_SRV 1 /* topology server service type */ +#define TIPC_LINK_STATE 2 /* link state service type */ +#define TIPC_RESERVED_TYPES 64 /* lowest user-allowed service type */ /* - * Publication scopes when binding port names and port name sequences + * Publication scopes when binding service / service range */ enum tipc_scope { TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */ @@ -108,28 +108,28 @@ enum tipc_scope { * TIPC topology subscription service definitions */ -#define TIPC_SUB_PORTS 0x01 /* filter for port availability */ -#define TIPC_SUB_SERVICE 0x02 /* filter for service availability */ -#define TIPC_SUB_CANCEL 0x04 /* cancel a subscription */ +#define TIPC_SUB_PORTS 0x01 /* filter: evt at each match */ +#define TIPC_SUB_SERVICE 0x02 /* filter: evt at first up/last down */ +#define TIPC_SUB_CANCEL 0x04 /* filter: cancel a subscription */ #define TIPC_WAIT_FOREVER (~0) /* timeout for permanent subscription */ struct tipc_subscr { - struct tipc_name_seq seq; /* name sequence of interest */ + struct tipc_service_range seq; 
/* range of interest */ __u32 timeout; /* subscription duration (in ms) */ __u32 filter; /* bitmask of filter options */ char usr_handle[8]; /* available for subscriber use */ }; #define TIPC_PUBLISHED 1 /* publication event */ -#define TIPC_WITHDRAWN 2 /* withdraw event */ +#define TIPC_WITHDRAWN 2 /* withdrawal event */ #define TIPC_SUBSCR_TIMEOUT 3 /* subscription timeout event */ struct tipc_event { __u32 event; /* event type */ - __u32 found_lower; /* matching name seq instances */ - __u32 found_upper; /* " " " " */ - struct tipc_portid port; /* associated port */ + __u32 found_lower; /* matching range */ + __u32 found_upper; /* " " */ + struct tipc_socket_addr port; /* associated socket */ struct tipc_subscr s; /* associated subscription */ }; @@ -149,20 +149,20 @@ struct tipc_event { #define SOL_TIPC 271 #endif -#define TIPC_ADDR_NAMESEQ 1 -#define TIPC_ADDR_MCAST 1 -#define TIPC_ADDR_NAME 2 -#define TIPC_ADDR_ID 3 +#define TIPC_ADDR_MCAST 1 +#define TIPC_SERVICE_RANGE 1 +#define TIPC_SERVICE_ADDR 2 +#define TIPC_SOCKET_ADDR 3 struct sockaddr_tipc { unsigned short family; unsigned char addrtype; signed char scope; union { - struct tipc_portid id; - struct tipc_name_seq nameseq; + struct tipc_socket_addr id; + struct tipc_service_range nameseq; struct { - struct tipc_name name; + struct tipc_service_addr name; __u32 domain; } name; } addr; @@ -216,7 +216,7 @@ struct tipc_group_req { #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 -#define TIPC_MAX_LINK_NAME 60 +#define TIPC_MAX_LINK_NAME 68 #define SIOCGETLINKNAME SIOCPROTOPRIVATE @@ -230,8 +230,13 @@ struct tipc_sioc_ln_req { /* The macros and functions below are deprecated: */ +#define TIPC_CFG_SRV 0 #define TIPC_ZONE_SCOPE 1 +#define TIPC_ADDR_NAMESEQ 1 +#define TIPC_ADDR_NAME 2 +#define TIPC_ADDR_ID 3 + #define TIPC_NODE_BITS 12 #define TIPC_CLUSTER_BITS 12 #define TIPC_ZONE_BITS 8 @@ -250,6 +255,10 @@ struct tipc_sioc_ln_req { #define TIPC_ZONE_CLUSTER_MASK 
(TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) +#define tipc_portid tipc_socket_addr +#define tipc_name tipc_service_addr +#define tipc_name_seq tipc_service_range + static __inline__ __u32 tipc_addr(unsigned int zone, unsigned int cluster, unsigned int node) From 5b1c363c7b3cd489583285f2ae75400b0494462c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 18 Apr 2018 13:05:48 +0800 Subject: [PATCH 07/28] vxlan: fix ttl inherit behavior Like kernel net-next commit 72f6d71e491e6 ("vxlan: add ttl inherit support"), vxlan ttl inherit should mean inheriting the inner protocol's ttl value. But currently when we add vxlan with "ttl inherit", we only set ttl 0, which actually means using whatever default value instead of inheriting the inner protocol's ttl value. To distinguish ttl inherit from ttl == 0, we add an attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is specified, and use "ttl auto" to mean "use whatever default value", the same behavior as ttl == 0. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: David Ahern --- ip/iplink_vxlan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index be9f35e4..d4d793b6 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -143,14 +143,18 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, NEXT_ARG(); check_duparg(&attrs, IFLA_VXLAN_TTL, "ttl", *argv); - if (strcmp(*argv, "inherit") != 0) { + if (strcmp(*argv, "inherit") == 0) { + addattr_l(n, 1024, IFLA_VXLAN_TTL_INHERIT, NULL, 0); + } else if (strcmp(*argv, "auto") == 0) { + addattr8(n, 1024, IFLA_VXLAN_TTL, ttl); + } else { if (get_unsigned(&uval, *argv, 0)) invarg("invalid TTL", *argv); if (uval > 255) invarg("TTL must be <= 255", *argv); ttl = uval; + addattr8(n, 1024, IFLA_VXLAN_TTL, ttl); } - addattr8(n, 1024, IFLA_VXLAN_TTL, ttl); } else if (!matches(*argv, "tos") || !matches(*argv, "dsfield")) { __u32 uval; From 
d21c028cf74147360c530a4c53063bbe677dbe73 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 20 Apr 2018 10:31:59 +0200 Subject: [PATCH 08/28] man: ip link: document GRE tunnels GRE tunnels are currently only documented together with IPIP and SIT tunnels, but they actually have very different configuration options. Let's separate them. Signed-off-by: Sabrina Dubroca Signed-off-by: David Ahern --- man/man8/ip-link.8.in | 152 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 4 deletions(-) diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 5dee9fcd..77ab8a3b 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -693,13 +693,13 @@ tunnel. .in -8 .TP -GRE, IPIP, SIT Type Support -For a link of types -.I GRE/IPIP/SIT +IPIP, SIT Type Support +For a link of type +.IR IPIP or SIT the following additional arguments are supported: .BI "ip link add " DEVICE -.BR type " { " gre " | " ipip " | " sit " }" +.BR type " { " ipip " | " sit " }" .BI " remote " ADDR " local " ADDR [ .BR encap " { " fou " | " gue " | " none " }" @@ -764,6 +764,150 @@ IPv6-Over-IPv4 is not supported for IPIP. - make this tunnel externally controlled .RB "(e.g. " "ip route encap" ). +.in -8 +.TP +GRE Type Support +For a link of type +.IR GRE " or " GRETAP +the following additional arguments are supported: + +.BI "ip link add " DEVICE +.BR type " { " gre " | " gretap " }" +.BI " remote " ADDR " local " ADDR +[ +.RB [ i | o ] seq +] [ +.RB [ i | o ] key +.I KEY +] [ +.RB [ i | o ] csum +] [ +.BI ttl " TTL " +] [ +.BI tos " TOS " +] [ +.RB [ no ] pmtudisc +] [ +.RB [ no ] ignore-df +] [ +.BI dev " PHYS_DEV " +] [ +.BR encap " { " fou " | " gue " | " none " }" +] [ +.BR encap-sport " { " \fIPORT " | " auto " }" +] [ +.BI "encap-dport " PORT +] [ +.RB [ no ] encap-csum +] [ +.RB [ no ] encap-remcsum +] [ +.BR external +] + +.in +8 +.sp +.BI remote " ADDR " +- specifies the remote address of the tunnel. 
+ +.sp +.BI local " ADDR " +- specifies the fixed local address for tunneled packets. +It must be an address on another interface on this host. + +.sp +.RB [ i | o ] seq +- serialize packets. +The +.B oseq +flag enables sequencing of outgoing packets. +The +.B iseq +flag requires that all input packets are serialized. + +.sp +.RB [ i | o ] key +.I KEY +- use keyed GRE with key +.IR KEY ". "KEY +is either a number or an IPv4 address-like dotted quad. +The +.B key +parameter specifies the same key to use in both directions. +The +.BR ikey " and " okey +parameters specify different keys for input and output. + +.sp +.RB [ i | o ] csum +- generate/require checksums for tunneled packets. +The +.B ocsum +flag calculates checksums for outgoing packets. +The +.B icsum +flag requires that all input packets have the correct +checksum. The +.B csum +flag is equivalent to the combination +.B "icsum ocsum" . + +.sp +.BI ttl " TTL" +- specifies the TTL value to use in outgoing packets. + +.sp +.BI tos " TOS" +- specifies the TOS value to use in outgoing packets. + +.sp +.RB [ no ] pmtudisc +- enables/disables Path MTU Discovery on this tunnel. +It is enabled by default. Note that a fixed ttl is incompatible +with this option: tunneling with a fixed ttl always makes pmtu +discovery. + +.sp +.RB [ no ] ignore-df +- enables/disables IPv4 DF suppression on this tunnel. +Normally datagrams that exceed the MTU will be fragmented; the presence +of the DF flag inhibits this, resulting instead in an ICMP Unreachable +(Fragmentation Required) message. Enabling this attribute causes the +DF flag to be ignored. + +.sp +.BI dev " PHYS_DEV" +- specifies the physical device to use for tunnel endpoint communication. + +.sp +.BR encap " { " fou " | " gue " | " none " }" +- specifies type of secondary UDP encapsulation. "fou" indicates +Foo-Over-UDP, "gue" indicates Generic UDP Encapsulation. + +.sp +.BR encap-sport " { " \fIPORT " | " auto " }" +- specifies the source port in UDP encapsulation. 
+.IR PORT +indicates the port by number, "auto" +indicates that the port number should be chosen automatically +(the kernel picks a flow based on the flow hash of the +encapsulated packet). + +.sp +.RB [ no ] encap-csum +- specifies if UDP checksums are enabled in the secondary +encapsulation. + +.sp +.RB [ no ] encap-remcsum +- specifies if Remote Checksum Offload is enabled. This is only +applicable for Generic UDP Encapsulation. + +.sp +.BR external +- make this tunnel externally controlled +.RB "(e.g. " "ip route encap" ). + .in -8 .TP From 7f520601f59ee35da2fc48b3f1b39ed2b80c9efa Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 20 Apr 2018 10:32:00 +0200 Subject: [PATCH 09/28] gre/gre6: allow clearing {,i,o}{key,seq,csum} flags Currently, iproute allows setting those flags, but it's impossible to clear them, since their current value is fetched from the kernel and then we OR in the additional flags passed on the command line. Add no* variants to allow clearing them. Signed-off-by: Sabrina Dubroca Signed-off-by: David Ahern --- ip/link_gre.c | 30 +++++++++++++++++++++++++++--- ip/link_gre6.c | 30 +++++++++++++++++++++++++++--- man/man8/ip-link.8.in | 27 ++++++++++++++++++--------- 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/ip/link_gre.c b/ip/link_gre.c index bc1cee8f..ede761b2 100644 --- a/ip/link_gre.c +++ b/ip/link_gre.c @@ -31,9 +31,9 @@ static void gre_print_help(struct link_util *lu, int argc, char **argv, FILE *f) ); fprintf(f, " [ local ADDR ]\n" - " [ [i|o]seq ]\n" - " [ [i|o]key KEY ]\n" - " [ [i|o]csum ]\n" + " [ [no][i|o]seq ]\n" + " [ [i|o]key KEY | no[i|o]key ]\n" + " [ [no][i|o]csum ]\n" " [ ttl TTL ]\n" " [ tos TOS ]\n" " [ [no]pmtudisc ]\n" @@ -210,28 +210,52 @@ get_failed: iflags |= GRE_KEY; oflags |= GRE_KEY; ikey = okey = tnl_parse_key("key", *argv); + } else if (!matches(*argv, "nokey")) { + iflags &= ~GRE_KEY; + oflags &= ~GRE_KEY; + ikey = okey = 0; } else if (!matches(*argv, "ikey")) { NEXT_ARG(); iflags |= 
GRE_KEY; ikey = tnl_parse_key("ikey", *argv); + } else if (!matches(*argv, "noikey")) { + iflags &= ~GRE_KEY; + ikey = 0; } else if (!matches(*argv, "okey")) { NEXT_ARG(); oflags |= GRE_KEY; okey = tnl_parse_key("okey", *argv); + } else if (!matches(*argv, "nookey")) { + oflags &= ~GRE_KEY; + okey = 0; } else if (!matches(*argv, "seq")) { iflags |= GRE_SEQ; oflags |= GRE_SEQ; + } else if (!matches(*argv, "noseq")) { + iflags &= ~GRE_SEQ; + oflags &= ~GRE_SEQ; } else if (!matches(*argv, "iseq")) { iflags |= GRE_SEQ; + } else if (!matches(*argv, "noiseq")) { + iflags &= ~GRE_SEQ; } else if (!matches(*argv, "oseq")) { oflags |= GRE_SEQ; + } else if (!matches(*argv, "nooseq")) { + oflags &= ~GRE_SEQ; } else if (!matches(*argv, "csum")) { iflags |= GRE_CSUM; oflags |= GRE_CSUM; + } else if (!matches(*argv, "nocsum")) { + iflags &= ~GRE_CSUM; + oflags &= ~GRE_CSUM; } else if (!matches(*argv, "icsum")) { iflags |= GRE_CSUM; + } else if (!matches(*argv, "noicsum")) { + iflags &= ~GRE_CSUM; } else if (!matches(*argv, "ocsum")) { oflags |= GRE_CSUM; + } else if (!matches(*argv, "noocsum")) { + oflags &= ~GRE_CSUM; } else if (!matches(*argv, "nopmtudisc")) { pmtudisc = 0; } else if (!matches(*argv, "pmtudisc")) { diff --git a/ip/link_gre6.c b/ip/link_gre6.c index a6fe0b73..181b2eae 100644 --- a/ip/link_gre6.c +++ b/ip/link_gre6.c @@ -38,9 +38,9 @@ static void gre_print_help(struct link_util *lu, int argc, char **argv, FILE *f) ); fprintf(f, " [ local ADDR ]\n" - " [ [i|o]seq ]\n" - " [ [i|o]key KEY ]\n" - " [ [i|o]csum ]\n" + " [ [no][i|o]seq ]\n" + " [ [i|o]key KEY | no[i|o]key ]\n" + " [ [no][i|o]csum ]\n" " [ hoplimit TTL ]\n" " [ encaplimit ELIM ]\n" " [ tclass TCLASS ]\n" @@ -220,28 +220,52 @@ get_failed: iflags |= GRE_KEY; oflags |= GRE_KEY; ikey = okey = tnl_parse_key("key", *argv); + } else if (!matches(*argv, "nokey")) { + iflags &= ~GRE_KEY; + oflags &= ~GRE_KEY; + ikey = okey = 0; } else if (!matches(*argv, "ikey")) { NEXT_ARG(); iflags |= GRE_KEY; ikey = 
tnl_parse_key("ikey", *argv); + } else if (!matches(*argv, "noikey")) { + iflags &= ~GRE_KEY; + ikey = 0; } else if (!matches(*argv, "okey")) { NEXT_ARG(); oflags |= GRE_KEY; okey = tnl_parse_key("okey", *argv); + } else if (!matches(*argv, "nookey")) { + oflags &= ~GRE_KEY; + okey = 0; } else if (!matches(*argv, "seq")) { iflags |= GRE_SEQ; oflags |= GRE_SEQ; + } else if (!matches(*argv, "noseq")) { + iflags &= ~GRE_SEQ; + oflags &= ~GRE_SEQ; } else if (!matches(*argv, "iseq")) { iflags |= GRE_SEQ; + } else if (!matches(*argv, "noiseq")) { + iflags &= ~GRE_SEQ; } else if (!matches(*argv, "oseq")) { oflags |= GRE_SEQ; + } else if (!matches(*argv, "nooseq")) { + oflags &= ~GRE_SEQ; } else if (!matches(*argv, "csum")) { iflags |= GRE_CSUM; oflags |= GRE_CSUM; + } else if (!matches(*argv, "nocsum")) { + iflags &= ~GRE_CSUM; + oflags &= ~GRE_CSUM; } else if (!matches(*argv, "icsum")) { iflags |= GRE_CSUM; + } else if (!matches(*argv, "noicsum")) { + iflags &= ~GRE_CSUM; } else if (!matches(*argv, "ocsum")) { oflags |= GRE_CSUM; + } else if (!matches(*argv, "noocsum")) { + oflags &= ~GRE_CSUM; } else if (!matches(*argv, "remote")) { NEXT_ARG(); get_addr(&daddr, *argv, AF_INET6); diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 77ab8a3b..83ef3cae 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -775,12 +775,14 @@ the following additional arguments are supported: .BR type " { " gre " | " gretap " }" .BI " remote " ADDR " local " ADDR [ -.RB [ i | o ] seq +.RB [ no ] "" [ i | o ] seq ] [ .RB [ i | o ] key .I KEY +| +.BR no [ i | o ] key ] [ -.RB [ i | o ] csum +.RB [ no ] "" [ i | o ] csum ] [ .BI ttl " TTL " ] [ @@ -816,7 +818,7 @@ the following additional arguments are supported: It must be an address on another interface on this host. .sp -.RB [ i | o ] seq +.RB [ no ] "" [ i | o ] seq - serialize packets. The .B oseq @@ -828,6 +830,8 @@ flag requires that all input packets are serialized. 
.sp .RB [ i | o ] key .I KEY +| +.BR no [ i | o ] key - use keyed GRE with key .IR KEY ". "KEY is either a number or an IPv4 address-like dotted quad. @@ -839,7 +843,7 @@ The parameters specify different keys for input and output. .sp -.RB [ i | o ] csum +.RB [ no ] "" [ i | o ] csum - generate/require checksums for tunneled packets. The .B ocsum @@ -920,12 +924,14 @@ the following additional arguments are supported: .BR type " { " ip6gre " | " ip6gretap " }" .BI remote " ADDR " local " ADDR" [ -.RB [ i | o ] seq +.RB [ no ] "" [ i | o ] seq ] [ .RB [ i | o ] key .I KEY +| +.BR no [ i | o ] key ] [ -.RB [ i | o ] csum +.RB [ no ] "" [ i | o ] csum ] [ .BI hoplimit " TTL " ] [ @@ -955,7 +961,7 @@ the following additional arguments are supported: It must be an address on another interface on this host. .sp -.RB [ i | o ] seq +.RB [ no ] "" [ i | o ] seq - serialize packets. The .B oseq @@ -965,7 +971,10 @@ The flag requires that all input packets are serialized. .sp -.RB [ i | o ] key " \fIKEY" +.RB [ i | o ] key +.I KEY +| +.BR no [ i | o ] key - use keyed GRE with key .IR KEY ". "KEY is either a number or an IPv4 address-like dotted quad. @@ -977,7 +986,7 @@ The parameters specify different keys for input and output. .sp -.RB [ i | o ] csum +.RB [ no ] "" [ i | o ] csum - generate/require checksums for tunneled packets. 
The .B ocsum From 8f01001abc01df5df343ec27b451e0adf83e8888 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 24 Apr 2018 10:40:17 +0800 Subject: [PATCH 10/28] vxlan: add ttl auto in help message Signed-off-by: Hangbin Liu Signed-off-by: David Ahern --- ip/iplink_vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index d4d793b6..2bc253fc 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -51,7 +51,7 @@ static void print_explain(FILE *f) "Where: VNI := 0-16777215\n" " ADDR := { IP_ADDRESS | any }\n" " TOS := { NUMBER | inherit }\n" - " TTL := { 1..255 | inherit }\n" + " TTL := { 1..255 | auto | inherit }\n" " LABEL := 0-1048575\n" ); } From 0c0394ff83567e263491dc4aeccff5de9fbdcdcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 May 2018 17:37:51 -0700 Subject: [PATCH 11/28] bpf: don't offload perf array maps Perf arrays are handled specially by the kernel, don't request offload even when used by an offloaded program. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Daniel Borkmann Signed-off-by: David Ahern --- lib/bpf.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/bpf.c b/lib/bpf.c index d9a406bf..4e26c0df 100644 --- a/lib/bpf.c +++ b/lib/bpf.c @@ -97,6 +97,11 @@ static const struct bpf_prog_meta __bpf_prog_meta[] = { }, }; +static bool bpf_map_offload_neutral(enum bpf_map_type type) +{ + return type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +} + static const char *bpf_prog_to_subdir(enum bpf_prog_type type) { assert(type < ARRAY_SIZE(__bpf_prog_meta) && @@ -1594,7 +1599,7 @@ static int bpf_map_attach(const char *name, struct bpf_elf_ctx *ctx, const struct bpf_elf_map *map, struct bpf_map_ext *ext, int *have_map_in_map) { - int fd, ret, map_inner_fd = 0; + int fd, ifindex, ret, map_inner_fd = 0; fd = bpf_probe_pinned(name, ctx, map->pinning); if (fd > 0) { @@ -1631,10 +1636,10 @@ static int bpf_map_attach(const char *name, struct bpf_elf_ctx *ctx, } } + ifindex = bpf_map_offload_neutral(map->type) ? 
0 : ctx->ifindex; errno = 0; fd = bpf_map_create(map->type, map->size_key, map->size_value, - map->max_elem, map->flags, map_inner_fd, - ctx->ifindex); + map->max_elem, map->flags, map_inner_fd, ifindex); if (fd < 0 || ctx->verbose) { bpf_map_report(fd, name, map, ctx, map_inner_fd); From fd95ec0e8edd5dec5debc04212d31904ce4a987e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:52:52 -0700 Subject: [PATCH 12/28] Update kernel headers Update kernel headers to commit 53a7bdfb2a27 ("dt-bindings: dsa: Remove unnecessary #address/#size-cells") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 1756 +++++++++++++++++++++++------ include/uapi/linux/tcp.h | 11 + include/uapi/linux/tipc.h | 12 +- include/uapi/linux/tipc_netlink.h | 1 + 4 files changed, 1447 insertions(+), 333 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02b96cba..4da29ae0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_cmd { BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -115,6 +116,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { @@ -279,6 +281,9 @@ union bpf_attr { */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -363,398 +368,1464 @@ union bpf_attr { __u64 name; __u32 prog_fd; } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); -/* BPF helper function descriptions: +/* The description below is an attempt at providing 
documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: * - * void *bpf_map_lookup_elem(&map, &key) - * Return: Map value or NULL + * $ ./scripts/bpf_helpers_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 * - * int bpf_map_update_elem(&map, &key, &value, flags) - * Return: 0 on success or negative error + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. * - * int bpf_map_delete_elem(&map, &key) - * Return: 0 on success or negative error + * Start of BPF helper function descriptions: * - * int bpf_probe_read(void *dst, int size, void *src) - * Return: 0 on success or negative error + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read(void *dst, u32 size, const void *src) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * address *src* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. * * u64 bpf_ktime_get_ns(void) - * Return: current ktime + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Return + * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, int fmt_size, ...) - * Return: length of buffer written or negative error + * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as for any eBPF helper, the total number of arguments is + * limited to five). * - * u32 bpf_prandom_u32(void) - * Return: random value + * Each time the helper is called, it appends a line to the trace. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory).
However, it usually + * defaults to something like: * - * u32 bpf_raw_smp_processor_id(void) - * Return: SMP processor ID + * :: * - * int bpf_skb_store_bytes(skb, offset, from, len, flags) - * store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success or negative error + * telnet-470 [001] .N.. 419421.045894: 0x00000001: * - * int bpf_l3_csum_replace(skb, offset, from, to, flags) - * recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success or negative error + * In the above: * - * int bpf_l4_csum_replace(skb, offset, from, to, flags) - * recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success or negative error + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. 
* - * int bpf_tail_call(ctx, prog_array_map, index) - * jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: 32-bit index inside array that selects specific program to run - * Return: 0 on success or negative error + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. * - * int bpf_clone_redirect(skb, ifindex, flags) - * redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success or negative error + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure.
+ * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with preemption disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. 
+ * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. 
+ * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 32. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. * * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success or negative error + * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. 
The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. * - * u32 bpf_get_cgroup_classid(skb) - * retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. * - * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) - * Return: 0 on success or negative error + * This helper can be used on TC egress path, but not on ingress. * - * int bpf_skb_vlan_pop(skb) - * Return: 0 on success or negative error + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/cgroup-v1/net_cls.txt*. * - * int bpf_skb_get_tunnel_key(skb, key, size, flags) - * int bpf_skb_set_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Return: 0 on success or negative error + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). 
* - * u64 bpf_perf_event_read(map, flags) - * read perf event counter value - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * Return: value of perf event counter read or error code + * This helper is only available if the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * Return + * The classid, or 0 for the default unconfigured classid. * - * int bpf_redirect(ifindex, flags) - * redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: - * cls_bpf: - * bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * xdp_bpf: - * all bits - reserved - * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error - * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error - * int bpf_redirect_map(map, key, flags) - * redirect to endpoint in map - * @map: pointer to dev map - * @key: index in map to lookup - * @flags: -- - * Return: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. * - * u32 bpf_get_route_realm(skb) - * retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure.
* - * int bpf_perf_event_output(ctx, map, flags, data, size) - * output perf raw sample - * @ctx: struct pt_regs* - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success or negative error + * int bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. * - * int bpf_get_stackid(ctx, map, flags) - * walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * - * s64 bpf_csum_diff(from, from_size, to, to_size, seed) - * calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result or negative error code + * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. 
* - * int bpf_skb_get_tunnel_opt(skb, opt, size) - * retrieve tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: option size + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. * - * int bpf_skb_set_tunnel_opt(skb, opt, size) - * populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success or negative error + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: * - * int bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is v4 -> v6, - * v6 -> v4 transitions. The helper will also resize the skb. eBPF - * program is expected to fill the new headers via skb_store_bytes - * and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error + * :: * - * int bpf_skb_change_type(skb, type) - * Change packet type of skb. 
- * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet * - * int bpf_skb_under_cgroup(skb, map, index) - * Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. * - * u32 bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash + * This can be used together with various tunnels such as VXLAN, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb*. The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes.
+ * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). 
This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * + * int bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can be attained with the more generic + * **bpf_redirect_map**\ (), which requires specific maps to be + * used but offers better performance. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm of the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task.
+ * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* also needs to be passed to + * the helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it.
An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suited for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper).
+ * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information.
+ * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **bpf_skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*\ s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **bpf_redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values.
At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. * * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current + * Return + * A pointer to the current task struct. 
* - * int bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error + * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. * - * int bpf_current_task_under_cgroup(map, index) - * Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. * - * int bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, to be used f.e. - * with control messages. - * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the skb is non-linear - * and not all of len are part of the linear section. Only needed for - * read/write with direct packet access. 
- * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error + * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run in the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: * - * s64 bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error + * * 0, if the current task does not belong to the cgroup2. + * * 1, if the current task belongs to the cgroup2. + * * A negative error code, if an error occurred. * - * void bpf_set_hash_invalid(skb) - * Invalidate current skb->hash. - * @skb: pointer to skb + * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. * - * int bpf_get_numa_node_id() - * Return: Id of current NUMA node. + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. * - * int bpf_skb_change_head() - * Grows headroom of skb and adjusts MAC header offset accordingly. - * Will extends/reallocae as required automatically. - * May change skb data pointer and will thus invalidate any check - * performed for direct packet access.
- * @skb: pointer to skb - * @len: length of header to be pushed in front - * @flags: Flags (unused for now) - * Return: 0 on success or negative error + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(xdp_md, delta) - * Adjust the xdp_md.data by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data - * Return: 0 on success or negative on error + * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then the whole length of the *skb* is pulled. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data**\ () + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write.
As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data**\ () to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * int bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ ().
+ * Return + * The id of current NUMA node. + * + * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. * * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) - * Copy a NUL terminated string from unsafe address. In case the string - * length is smaller than size, the target is not padded with further NUL - * bytes. In case the string length is larger than size, just count-1 - * bytes are copied and the last byte is set to NUL.
- * @dst: destination address - * @size: maximum number of bytes to copy, including the trailing NUL - * @unsafe_ptr: unsafe address - * Return: - * > 0 length of the string including the trailing NUL on success - * < 0 error + * Description + * Copy a NUL terminated string from an unsafe address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. * - * u64 bpf_get_socket_cookie(skb) - * Get the cookie for the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: 8 Bytes non-decreasing number on success or 0 if the socket - * field is missing inside sk_buff + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: * - * u32 bpf_get_socket_uid(skb) - * Get the owner uid of the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: uid of the socket owner on success or overflowuid if failed. + * :: * - * u32 bpf_set_hash(skb, hash) - * Set full skb->hash. - * @skb: pointer to skb - * @hash: hash to set + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_str(buf, sizeof(buf), + * ctx->di); * - * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) - * Calls setsockopt. Not all opts are available, only those with - * integer optvals plus TCP_CONGESTION. 
- * Supported levels: SOL_SOCKET and IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } * - * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) - * Calls getsockopt. Not all opts are available. - * Supported levels: IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error + * In comparison, using **bpf_probe_read()** helper here instead + * to read the string would require to estimate the length at + * compile time, and would often result in copying more memory + * than necessary. * - * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) - * Set callback flags for sock_ops - * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct - * @flags: flags value - * Return: 0 for no error - * -EINVAL if there is no full tcp socket - * bits in flags that are not supported by current kernel + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. * - * int bpf_skb_adjust_room(skb, len_diff, mode, flags) - * Grow or shrink room in sk_buff. 
- * @skb: pointer to skb - * @len_diff: (signed) amount of room to grow/shrink - * @mode: operation mode (enum bpf_adj_room_mode) - * @flags: reserved for future use - * Return: 0 on success or negative error code + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a unique socket + * identifier per namespace. + * Return + * A 8-byte long non-decreasing number on success, or 0 if the + * socket field is missing inside *skb*. * - * int bpf_sk_redirect_map(map, key, flags) - * Redirect skb to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). * - * int bpf_sock_map_update(skops, map, key, flags) - * @skops: pointer to bpf_sock_ops - * @map: pointer to sockmap to update - * @key: key to insert/update sock in map - * @flags: same flags as map update elem + * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. 
+ * Return + * 0 * - * int bpf_xdp_adjust_meta(xdp_md, delta) - * Adjust the xdp_md.data_meta by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data_meta - * Return: 0 on success or negative on error + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. * - * int bpf_perf_event_read_value(map, flags, buf, buf_size) - * read perf event counter value and perf event enabled/running time - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @buf: buf to fill - * @buf_size: size of the buf - * Return: 0 on success or negative error code + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: * - * int bpf_perf_prog_read_value(ctx, buf, buf_size) - * read perf prog attached perf event counter and enabled/running time - * @ctx: pointer to ctx - * @buf: buf to fill - * @buf_size: size of the buf - * Return : 0 on success or negative error code + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * Return + * 0 on success, or a negative error in case of failure. 
* - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set + * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. * - * int bpf_msg_redirect_map(map, key, flags) - * Redirect msg to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS + * There is a single supported mode at this time: + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed below the layer 3 header). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * When used to redirect packets to net devices, this helper + * provides a high performance increase over **bpf_redirect**\ ().
+ * This is due to various implementation details of the underlying + * mechanisms, one of which is the fact that **bpf_redirect_map**\ + * () tries to send packet as a "bulk" to the device. + * Return + * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * + * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. 
+ * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved.
+ * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure.
+ * + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *optval* and of length *optlen*. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the following *level*\ s: + * + * * **IPPROTO_TCP**, which supports *optname* + * **TCP_CONGESTION**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value.
+ * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * The supported callback values that *argval* can combine are: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. 
+ * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. 
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. 
For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). Looking for a free port to bind to can be + * expensive, therefore binding to port is not permitted by the + * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) + * must be set to zero. + * Return + * 0 on success, or a negative error in case of failure.
+ * + * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * only possible to shrink the packet as of this writing, + * therefore *delta* must be a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. + * + * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Return + * 0 on success, or a negative error in case of failure. * - * int bpf_bind(ctx, addr, addr_len) - * Bind socket to address. Only binding to IP is supported, no port can be - * set in addr. 
- * @ctx: pointer to context of type bpf_sock_addr - * @addr: pointer to struct sockaddr to bind socket to - * @addr_len: length of sockaddr structure - * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -821,7 +1892,11 @@ union bpf_attr { FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ FN(msg_pull_data), \ - FN(bind), + FN(bind), \ + FN(xdp_adjust_tail), \ + FN(skb_get_xfrm_state), \ + FN(get_stack), \ + FN(skb_load_bytes_relative), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -855,11 +1930,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) @@ -879,6 +1957,12 @@ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, }; +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -927,6 +2011,19 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* user accessible mirror of in-kernel xfrm_state. 
+ * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + /* Generic BPF return codes which all BPF program types may support. * The values are binary compatible with their TC_ACT_* counter-part to * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT @@ -1017,6 +2114,7 @@ struct bpf_prog_info { __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; + __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; } __attribute__((aligned(8))); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index fc0e2650..2e766cf3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -122,6 +122,10 @@ enum { #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ struct tcp_repair_opt { __u32 opt_code; @@ -276,4 +280,11 @@ struct tcp_diag_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; }; +/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) 
*/ + +struct tcp_zerocopy_receive { + __u64 address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ +}; #endif /* _LINUX_TCP_H */ diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 88a7251b..7a166a0f 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -209,16 +209,16 @@ struct tipc_group_req { * The string formatting for each name element is: * media: media * interface: media:interface name - * link: Z.C.N:interface-Z.C.N:interface - * + * link: node:interface-node:interface */ - +#define TIPC_NODEID_LEN 16 #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 #define TIPC_MAX_LINK_NAME 68 -#define SIOCGETLINKNAME SIOCPROTOPRIVATE +#define SIOCGETLINKNAME SIOCPROTOPRIVATE +#define SIOCGETNODEID (SIOCPROTOPRIVATE + 1) struct tipc_sioc_ln_req { __u32 peer; @@ -226,6 +226,10 @@ struct tipc_sioc_ln_req { char linkname[TIPC_MAX_LINK_NAME]; }; +struct tipc_sioc_nodeid_req { + __u32 peer; + char node_id[TIPC_NODEID_LEN]; +}; /* The macros and functions below are deprecated: */ diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0affb682..85c11982 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -266,6 +266,7 @@ enum { TIPC_NLA_PROP_PRIO, /* u32 */ TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ + TIPC_NLA_PROP_MTU, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 From 7d40bdbc8d7aba43e4c416585cd889c169018d58 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Tue, 8 May 2018 13:55:28 +0200 Subject: [PATCH 13/28] tipc: Add support to set and get MTU for UDP bearer In this commit we introduce the ability to set and get MTU for UDP media and bearer. 
For set and get properties such as tolerance, window and priority, we already do: $ tipc media set PPROPERTY media MEDIA $ tipc media get PPROPERTY media MEDIA $ tipc bearer set OPTION media MEDIA ARGS $ tipc bearer get [OPTION] media MEDIA ARGS The same has been extended for MTU, with an exception to support only media type UDP. Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David Ahern --- tipc/bearer.c | 55 ++++++++++++++++++++++++++++++++++++++++++--------- tipc/media.c | 24 ++++++++++++++++++++-- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/tipc/bearer.c b/tipc/bearer.c index 0d845701..05dc84aa 100644 --- a/tipc/bearer.c +++ b/tipc/bearer.c @@ -42,7 +42,8 @@ static void _print_bearer_opts(void) "OPTIONS\n" " priority - Bearer link priority\n" " tolerance - Bearer link tolerance\n" - " window - Bearer link window\n"); + " window - Bearer link window\n" + " mtu - Bearer link mtu\n"); } void print_bearer_media(void) @@ -194,6 +195,21 @@ static int nl_add_udp_enable_opts(struct nlmsghdr *nlh, struct opt *opts, return 0; } +static char *cmd_get_media_type(const struct cmd *cmd, struct cmdl *cmdl, + struct opt *opts) +{ + struct opt *opt = get_opt(opts, "media"); + + if (!opt) { + if (help_flag) + (cmd->help)(cmdl); + else + fprintf(stderr, "error, missing bearer media\n"); + return NULL; + } + return opt->val; +} + static int nl_add_bearer_name(struct nlmsghdr *nlh, const struct cmd *cmd, struct cmdl *cmdl, struct opt *opts, const struct tipc_sup_media *sup_media) @@ -217,15 +233,8 @@ int cmd_get_unique_bearer_name(const struct cmd *cmd, struct cmdl *cmdl, struct opt *opt; const struct tipc_sup_media *entry; - - if (!(opt = get_opt(opts, "media"))) { - if (help_flag) - (cmd->help)(cmdl); - else - fprintf(stderr, "error, missing bearer media\n"); + if (!(media = cmd_get_media_type(cmd, cmdl, opts))) return -EINVAL; - } - media = opt->val; for (entry = sup_media; entry->media; entry++) { if (strcmp(entry->media, 
media)) @@ -559,6 +568,8 @@ static int cmd_bearer_set_prop(struct nlmsghdr *nlh, const struct cmd *cmd, prop = TIPC_NLA_PROP_TOL; else if ((strcmp(cmd->cmd, "window") == 0)) prop = TIPC_NLA_PROP_WIN; + else if ((strcmp(cmd->cmd, "mtu") == 0)) + prop = TIPC_NLA_PROP_MTU; else return -EINVAL; @@ -571,6 +582,17 @@ static int cmd_bearer_set_prop(struct nlmsghdr *nlh, const struct cmd *cmd, if (parse_opts(opts, cmdl) < 0) return -EINVAL; + if (prop == TIPC_NLA_PROP_MTU) { + char *media = cmd_get_media_type(cmd, cmdl, opts); + + if (!media) + return -EINVAL; + else if (strcmp(media, "udp")) { + fprintf(stderr, "error, not supported for media\n"); + return -EINVAL; + } + } + if (!(nlh = msg_init(buf, TIPC_NL_BEARER_SET))) { fprintf(stderr, "error, message initialisation failed\n"); return -1; @@ -597,6 +619,7 @@ static int cmd_bearer_set(struct nlmsghdr *nlh, const struct cmd *cmd, { "priority", cmd_bearer_set_prop, cmd_bearer_set_help }, { "tolerance", cmd_bearer_set_prop, cmd_bearer_set_help }, { "window", cmd_bearer_set_prop, cmd_bearer_set_help }, + { "mtu", cmd_bearer_set_prop, cmd_bearer_set_help }, { NULL } }; @@ -877,12 +900,25 @@ static int cmd_bearer_get_prop(struct nlmsghdr *nlh, const struct cmd *cmd, prop = TIPC_NLA_PROP_TOL; else if ((strcmp(cmd->cmd, "window") == 0)) prop = TIPC_NLA_PROP_WIN; + else if ((strcmp(cmd->cmd, "mtu") == 0)) + prop = TIPC_NLA_PROP_MTU; else return -EINVAL; if (parse_opts(opts, cmdl) < 0) return -EINVAL; + if (prop == TIPC_NLA_PROP_MTU) { + char *media = cmd_get_media_type(cmd, cmdl, opts); + + if (!media) + return -EINVAL; + else if (strcmp(media, "udp")) { + fprintf(stderr, "error, not supported for media\n"); + return -EINVAL; + } + } + if (!(nlh = msg_init(buf, TIPC_NL_BEARER_GET))) { fprintf(stderr, "error, message initialisation failed\n"); return -1; @@ -904,6 +940,7 @@ static int cmd_bearer_get(struct nlmsghdr *nlh, const struct cmd *cmd, { "priority", cmd_bearer_get_prop, cmd_bearer_get_help }, { "tolerance", 
cmd_bearer_get_prop, cmd_bearer_get_help }, { "window", cmd_bearer_get_prop, cmd_bearer_get_help }, + { "mtu", cmd_bearer_get_prop, cmd_bearer_get_help }, { "media", cmd_bearer_get_media, cmd_bearer_get_help }, { NULL } }; diff --git a/tipc/media.c b/tipc/media.c index 6e10c7e5..969ef657 100644 --- a/tipc/media.c +++ b/tipc/media.c @@ -103,6 +103,8 @@ static int cmd_media_get_prop(struct nlmsghdr *nlh, const struct cmd *cmd, prop = TIPC_NLA_PROP_TOL; else if ((strcmp(cmd->cmd, "window") == 0)) prop = TIPC_NLA_PROP_WIN; + else if ((strcmp(cmd->cmd, "mtu") == 0)) + prop = TIPC_NLA_PROP_MTU; else return -EINVAL; @@ -123,6 +125,12 @@ static int cmd_media_get_prop(struct nlmsghdr *nlh, const struct cmd *cmd, fprintf(stderr, "error, missing media\n"); return -EINVAL; } + + if ((prop == TIPC_NLA_PROP_MTU) && + (strcmp(opt->val, "udp"))) { + fprintf(stderr, "error, not supported for media\n"); + return -EINVAL; + } nest = mnl_attr_nest_start(nlh, TIPC_NLA_MEDIA); mnl_attr_put_strz(nlh, TIPC_NLA_MEDIA_NAME, opt->val); mnl_attr_nest_end(nlh, nest); @@ -136,7 +144,8 @@ static void cmd_media_get_help(struct cmdl *cmdl) "PROPERTIES\n" " tolerance - Get media tolerance\n" " priority - Get media priority\n" - " window - Get media window\n", + " window - Get media window\n" + " mtu - Get media mtu\n", cmdl->argv[0]); } @@ -147,6 +156,7 @@ static int cmd_media_get(struct nlmsghdr *nlh, const struct cmd *cmd, { "priority", cmd_media_get_prop, cmd_media_get_help }, { "tolerance", cmd_media_get_prop, cmd_media_get_help }, { "window", cmd_media_get_prop, cmd_media_get_help }, + { "mtu", cmd_media_get_prop, cmd_media_get_help }, { NULL } }; @@ -159,7 +169,8 @@ static void cmd_media_set_help(struct cmdl *cmdl) "PROPERTIES\n" " tolerance TOLERANCE - Set media tolerance\n" " priority PRIORITY - Set media priority\n" - " window WINDOW - Set media window\n", + " window WINDOW - Set media window\n" + " mtu MTU - Set media mtu\n", cmdl->argv[0]); } @@ -183,6 +194,8 @@ static int 
cmd_media_set_prop(struct nlmsghdr *nlh, const struct cmd *cmd, prop = TIPC_NLA_PROP_TOL; else if ((strcmp(cmd->cmd, "window") == 0)) prop = TIPC_NLA_PROP_WIN; + else if ((strcmp(cmd->cmd, "mtu") == 0)) + prop = TIPC_NLA_PROP_MTU; else return -EINVAL; @@ -210,6 +223,12 @@ static int cmd_media_set_prop(struct nlmsghdr *nlh, const struct cmd *cmd, fprintf(stderr, "error, missing media\n"); return -EINVAL; } + + if ((prop == TIPC_NLA_PROP_MTU) && + (strcmp(opt->val, "udp"))) { + fprintf(stderr, "error, not supported for media\n"); + return -EINVAL; + } mnl_attr_put_strz(nlh, TIPC_NLA_MEDIA_NAME, opt->val); props = mnl_attr_nest_start(nlh, TIPC_NLA_MEDIA_PROP); @@ -228,6 +247,7 @@ static int cmd_media_set(struct nlmsghdr *nlh, const struct cmd *cmd, { "priority", cmd_media_set_prop, cmd_media_set_help }, { "tolerance", cmd_media_set_prop, cmd_media_set_help }, { "window", cmd_media_set_prop, cmd_media_set_help }, + { "mtu", cmd_media_set_prop, cmd_media_set_help }, { NULL } }; From 3f2c23811dc66d396be514f2cf6a1ec99f387373 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 15 May 2018 21:49:55 -0300 Subject: [PATCH 14/28] tc-netem: fix limit description in man page As the kernel code says, limit is actually the amount of packets it can hold queued at a time, as per: static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { ... if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop_all(skb, sch, to_free); So lets fix the description of the field in the man page. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David Ahern --- man/man8/tc-netem.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/tc-netem.8 b/man/man8/tc-netem.8 index b31384f5..f2cd86b6 100644 --- a/man/man8/tc-netem.8 +++ b/man/man8/tc-netem.8 @@ -65,7 +65,7 @@ netem has the following options: .SS limit packets -limits the effect of selected options to the indicated number of next packets. 
+maximum number of packets the qdisc may hold queued at a time. .SS delay adds the chosen delay to the packets outgoing to chosen network interface. The From 4276e652900347d8f14829bccb6ae82881c4d724 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 18 May 2018 09:05:07 -0700 Subject: [PATCH 15/28] Update kernel headers Update kernel headers to commit 64a2658b58ab ("net: mscc: Add SPDX identifier") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 142 ++++++++++++++++++++++++++++++++++- include/uapi/linux/pkt_cls.h | 1 + 2 files changed, 142 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4da29ae0..c981596c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_cmd { BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, }; enum bpf_map_type { @@ -117,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -344,6 +346,7 @@ union bpf_attr { __u32 start_id; __u32 prog_id; __u32 map_id; + __u32 btf_id; }; __u32 next_id; __u32 open_flags; @@ -1826,6 +1829,79 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst, + * ipv6_dst or mpls_out based on family, smac is set to mac + * address of egress device, dmac is set to nexthop mac address, + * rt_metric is set to metric from route. + * + * *plen* argument is the size of the passed in struct. 
+ * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs + * full lookup using FIB rules + * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress + * perspective (default is ingress) + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * + * Return + * Egress device index on success, 0 if packet needs to continue + * up the stack for further processing or a negative error in case + * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. 
+ * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdeict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1896,7 +1972,11 @@ union bpf_attr { FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ FN(get_stack), \ - FN(skb_load_bytes_relative), + FN(skb_load_bytes_relative), \ + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2130,6 +2210,15 @@ struct bpf_map_info { __u32 ifindex; __u64 netns_dev; __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_id; + __u32 btf_value_id; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed @@ -2310,4 +2399,55 @@ struct bpf_raw_tracepoint_args { __u64 args[0]; }; +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +#define BPF_FIB_LOOKUP_DIRECT BIT(0) +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) + +struct bpf_fib_lookup { + /* input */ + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS 
*/ + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + /* total length of packet from network header - used for MTU check */ + __u16 tot_len; + __u32 ifindex; /* L3 device index for lookup */ + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowlabel; /* AF_INET6 */ + + /* output: metric of fib result */ + __u32 rt_metric; + }; + + union { + __be32 mpls_in; + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, *dst is destination address. + * output: bpf_fib_lookup sets to gateway address + */ + union { + /* return for MPLS lookups */ + __be32 mpls_out[4]; /* support up to 4 labels */ + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index be05e66c..84e4c1d0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -129,6 +129,7 @@ enum { #define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ #define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ #define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ /* U32 filters */ From ac6a4c2299a651e4cb54e6313435117ee8a3c228 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Sun, 13 May 2018 17:44:28 -0300 Subject: [PATCH 16/28] tc: flower: add support for verbose logging Currently there is no way to log offloading errors if the rule is not explicitly marked as skip_sw, making it hard for other applications such as Open vSwitch to log why a given rule could not be offloaded. This patch adds support for signaling the kernel that more verbose logging is wanted, which now will include such messages.
Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David Ahern --- man/man8/tc-flower.8 | 7 +++++++ tc/f_flower.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8 index a561443b..4f3714b7 100644 --- a/man/man8/tc-flower.8 +++ b/man/man8/tc-flower.8 @@ -22,6 +22,8 @@ flower \- flow based traffic control filter .IR MATCH " := { " .B indev .IR ifname " | " +.BR verbose +.RI " | " .BR skip_sw " | " skip_hw .RI " | { " .BR dst_mac " | " src_mac " } " @@ -100,6 +102,11 @@ is the name of an interface which must exist at the time of .B tc invocation. .TP +.BI verbose +Enable verbose logging, including offloading errors when not using +.B skip_sw +flag. +.TP .BI skip_sw Do not process filter by software. If hardware has no offload support for this filter, or TC offload is not enabled for the interface, operation will fail. diff --git a/tc/f_flower.c b/tc/f_flower.c index ba8eb66c..c7107651 100644 --- a/tc/f_flower.c +++ b/tc/f_flower.c @@ -41,7 +41,7 @@ enum flower_icmp_field { static void explain(void) { fprintf(stderr, - "Usage: ... flower [ MATCH-LIST ]\n" + "Usage: ... flower [ MATCH-LIST ] [ verbose ]\n" " [ skip_sw | skip_hw ]\n" " [ action ACTION-SPEC ] [ classid CLASSID ]\n" "\n" @@ -648,6 +648,8 @@ static int flower_parse_opt(struct filter_util *qu, char *handle, fprintf(stderr, "Illegal \"ip_flags\"\n"); return -1; } + } else if (matches(*argv, "verbose") == 0) { + flags |= TCA_CLS_FLAGS_VERBOSE; } else if (matches(*argv, "skip_hw") == 0) { flags |= TCA_CLS_FLAGS_SKIP_HW; } else if (matches(*argv, "skip_sw") == 0) { From 53d34eb66cf1c40b60927bdadb9959353980537e Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 17 May 2018 09:28:02 -0400 Subject: [PATCH 17/28] tc: add missing space symbol in ife output In order to make TDC tests match the output patterns, the missing space character must be added in the mode output string. 
Fixes: 8744c5d3388e3 ("tc: jsonify ife action") Signed-off-by: Roman Mashak Signed-off-by: David Ahern --- tc/m_ife.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tc/m_ife.c b/tc/m_ife.c index 5320e94d..20e9c73d 100644 --- a/tc/m_ife.c +++ b/tc/m_ife.c @@ -240,7 +240,7 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg) p = RTA_DATA(tb[TCA_IFE_PARMS]); print_string(PRINT_ANY, "kind", "%s ", "ife"); - print_string(PRINT_ANY, "mode", "%s", + print_string(PRINT_ANY, "mode", "%s ", p->flags & IFE_ENCODE ? "encode" : "decode"); print_action_control(f, "action ", p->action, " "); From 5947046dd9612fe07577d806d752ed72ee70f0e6 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 17 May 2018 16:02:42 +0200 Subject: [PATCH 18/28] tipc: fixed node and name table listings We make it easier for users to correlate between 128-bit node identities and 32-bit node hash number by extending the 'node list' command to also show the hash number. We also improve the 'nametable show' command to show the node identity instead of the node hash number. Since the former potentially is much longer than the latter, we make room for it by eliminating the (to the user) irrelevant publication key. We also reorder some of the columns so that the node id comes last, since this looks nicer and is more logical. 
Signed-off-by: David Ahern --- tipc/misc.c | 20 ++++++++++++++++++++ tipc/misc.h | 1 + tipc/nametable.c | 18 ++++++++++-------- tipc/node.c | 19 ++++++++----------- tipc/peer.c | 4 ++++ 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/tipc/misc.c b/tipc/misc.c index 16849f18..e4b1cd0c 100644 --- a/tipc/misc.c +++ b/tipc/misc.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include +#include +#include #include "misc.h" #define IN_RANGE(val, low, high) ((val) <= (high) && (val) >= (low)) @@ -109,3 +113,19 @@ void nodeid2str(uint8_t *id, char *str) for (i = 31; str[i] == '0'; i--) str[i] = 0; } + +void hash2nodestr(uint32_t hash, char *str) +{ + struct tipc_sioc_nodeid_req nr = {}; + int sd; + + sd = socket(AF_TIPC, SOCK_RDM, 0); + if (sd < 0) { + fprintf(stderr, "opening TIPC socket: %s\n", strerror(errno)); + return; + } + nr.peer = hash; + if (!ioctl(sd, SIOCGETNODEID, &nr)) + nodeid2str((uint8_t *)nr.node_id, str); + close(sd); +} diff --git a/tipc/misc.h b/tipc/misc.h index 6e8afddf..ff2f31f1 100644 --- a/tipc/misc.h +++ b/tipc/misc.h @@ -17,5 +17,6 @@ uint32_t str2addr(char *str); int str2nodeid(char *str, uint8_t *id); void nodeid2str(uint8_t *id, char *str); +void hash2nodestr(uint32_t hash, char *str); #endif diff --git a/tipc/nametable.c b/tipc/nametable.c index 2578940f..ae73dfa5 100644 --- a/tipc/nametable.c +++ b/tipc/nametable.c @@ -20,6 +20,7 @@ #include "cmdl.h" #include "msg.h" #include "nametable.h" +#include "misc.h" #define PORTID_STR_LEN 45 /* Four u32 and five delimiter chars */ @@ -31,6 +32,7 @@ static int nametable_show_cb(const struct nlmsghdr *nlh, void *data) struct nlattr *attrs[TIPC_NLA_NAME_TABLE_MAX + 1] = {}; struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1] = {}; const char *scope[] = { "", "zone", "cluster", "node" }; + char str[33] = {0,}; mnl_attr_parse(nlh, sizeof(*genl), parse_attrs, info); if (!info[TIPC_NLA_NAME_TABLE]) @@ -45,20 +47,20 @@ static int nametable_show_cb(const struct nlmsghdr *nlh, void *data) 
return MNL_CB_ERROR; if (!*iteration) - printf("%-10s %-10s %-10s %-10s %-10s %-10s\n", - "Type", "Lower", "Upper", "Node", "Port", - "Publication Scope"); + printf("%-10s %-10s %-10s %-8s %-10s %-33s\n", + "Type", "Lower", "Upper", "Scope", "Port", + "Node"); (*iteration)++; - printf("%-10u %-10u %-10u %-10x %-10u %-12u", + hash2nodestr(mnl_attr_get_u32(publ[TIPC_NLA_PUBL_NODE]), str); + + printf("%-10u %-10u %-10u %-8s %-10u %s\n", mnl_attr_get_u32(publ[TIPC_NLA_PUBL_TYPE]), mnl_attr_get_u32(publ[TIPC_NLA_PUBL_LOWER]), mnl_attr_get_u32(publ[TIPC_NLA_PUBL_UPPER]), - mnl_attr_get_u32(publ[TIPC_NLA_PUBL_NODE]), + scope[mnl_attr_get_u32(publ[TIPC_NLA_PUBL_SCOPE])], mnl_attr_get_u32(publ[TIPC_NLA_PUBL_REF]), - mnl_attr_get_u32(publ[TIPC_NLA_PUBL_KEY])); - - printf("%s\n", scope[mnl_attr_get_u32(publ[TIPC_NLA_PUBL_SCOPE])]); + str); return MNL_CB_OK; } diff --git a/tipc/node.c b/tipc/node.c index b73b644c..0fa1064c 100644 --- a/tipc/node.c +++ b/tipc/node.c @@ -26,10 +26,11 @@ static int node_list_cb(const struct nlmsghdr *nlh, void *data) { - uint32_t addr; struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); struct nlattr *info[TIPC_NLA_MAX + 1] = {}; struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1] = {}; + char str[33] = {}; + uint32_t addr; mnl_attr_parse(nlh, sizeof(*genl), parse_attrs, info); if (!info[TIPC_NLA_NODE]) @@ -40,13 +41,12 @@ static int node_list_cb(const struct nlmsghdr *nlh, void *data) return MNL_CB_ERROR; addr = mnl_attr_get_u32(attrs[TIPC_NLA_NODE_ADDR]); - printf("%x: ", addr); - + hash2nodestr(addr, str); + printf("%-32s %08x ", str, addr); if (attrs[TIPC_NLA_NODE_UP]) printf("up\n"); else printf("down\n"); - return MNL_CB_OK; } @@ -64,7 +64,7 @@ static int cmd_node_list(struct nlmsghdr *nlh, const struct cmd *cmd, fprintf(stderr, "error, message initialisation failed\n"); return -1; } - + printf("Node Identity Hash State\n"); return msg_dumpit(nlh, node_list_cb, NULL); } @@ -120,7 +120,7 @@ static int cmd_node_get_addr(struct nlmsghdr *nlh, const 
struct cmd *cmd, } close(sk); - printf("%x\n", addr.addr.id.node); + printf("%08x\n", addr.addr.id.node); return 0; } @@ -167,7 +167,6 @@ static int nodeid_get_cb(const struct nlmsghdr *nlh, void *data) uint8_t id[16] = {0,}; uint64_t *w0 = (uint64_t *) &id[0]; uint64_t *w1 = (uint64_t *) &id[8]; - int i; mnl_attr_parse(nlh, sizeof(*genl), parse_attrs, info); if (!info[TIPC_NLA_NET]) @@ -180,10 +179,8 @@ static int nodeid_get_cb(const struct nlmsghdr *nlh, void *data) *w0 = mnl_attr_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = mnl_attr_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); nodeid2str(id, str); - printf("Node Identity Hash\n"); - printf("%s", str); - for (i = strlen(str); i <= 33; i++) - printf(" "); + printf("Node Identity Hash\n"); + printf("%-33s", str); cmd_node_get_addr(NULL, NULL, NULL, NULL); return MNL_CB_OK; } diff --git a/tipc/peer.c b/tipc/peer.c index de0c73c3..f6380777 100644 --- a/tipc/peer.c +++ b/tipc/peer.c @@ -39,7 +39,11 @@ static int cmd_peer_rm_addr(struct nlmsghdr *nlh, const struct cmd *cmd, } str = shift_cmdl(cmdl); + + /* First try legacy Z.C.N format, then integer format */ addr = str2addr(str); + if (!addr) + addr = atoi(str); if (!addr) return -1; From 366d20b91fc6280b747c3fe146dcb5c7e8c95e6e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 13:41:02 -0700 Subject: [PATCH 19/28] rdma: update rdma_netlink.h to get new driver attributes Pull in the rdma_netlink.h changes from kernel commits: 25a0ad85156a ("RDMA/nldev: Add explicit pad attribute") da5c85078215 ("RDMA/nldev: add driver-specific resource tracking)" 0d52d803767e ("RDMA/uapi: Fix uapi breakage") Signed-off-by: Steve Wise Signed-off-by: David Ahern --- rdma/include/uapi/rdma/rdma_netlink.h | 30 ++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h index 60416ed7..6513fb89 100644 --- a/rdma/include/uapi/rdma/rdma_netlink.h +++ 
b/rdma/include/uapi/rdma/rdma_netlink.h @@ -249,10 +249,22 @@ enum rdma_nldev_command { RDMA_NLDEV_NUM_OPS }; +enum { + RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, +}; + +enum rdma_nldev_print_type { + RDMA_NLDEV_PRINT_TYPE_UNSPEC, + RDMA_NLDEV_PRINT_TYPE_HEX, +}; + enum rdma_nldev_attr { /* don't change the order or add anything between, this is ABI! */ RDMA_NLDEV_ATTR_UNSPEC, + /* Pad attribute for 64b alignment */ + RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC, + /* Identifier for ib_device */ RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ @@ -387,7 +399,6 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_PD_ENTRY, /* nested table */ RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ - /* * Provides logical name and index of netdevice which is * connected to physical port. This information is relevant @@ -400,7 +411,24 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */ RDMA_NLDEV_ATTR_NDEV_NAME, /* string */ + /* + * driver-specific attributes. + */ + RDMA_NLDEV_ATTR_DRIVER, /* nested table */ + RDMA_NLDEV_ATTR_DRIVER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_DRIVER_STRING, /* string */ + /* + * u8 values from enum rdma_nldev_print_type + */ + RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */ + RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */ + RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ + RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ + /* + * Always the end + */ RDMA_NLDEV_ATTR_MAX }; #endif /* _RDMA_NETLINK_H */ From 331152752a97dc33ad1222869712c0cc49f26068 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 13:41:09 -0700 Subject: [PATCH 20/28] rdma: print driver resource attributes This enhancement allows printing rdma device-specific state, if provided by the kernel. This is done in a generic manner, so rdma tool doesn't need to know about the details of every type of rdma device. 
Driver attributes for a rdma resource are in the form of tuples, where the key is a string and the value can be any supported driver attribute. The print_type attribute, if present, provides a print format to use vs the standard print format for the type. For example, the default print type for a PROVIDER_S32 value is "%d ", but "0x%x " if the print_type of PRINT_TYPE_HEX is included in the tuple. Driver resources are only printed when the -dd flag is present. If -p is present, then the output is formatted to not exceed 80 columns, otherwise it is printed as a single row to be grep/awk friendly. Example output: # rdma resource show qp lqpn 1028 -dd -p link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 path-mig-state MIGRATED pid 0 comm [nvme_rdma] sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 flush_cidx 85 in_use 0 size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 msn 44 rqt_hwaddr 0x2a8a5d00 rqt_size 256 in_use 128 size 130 Signed-off-by: Steve Wise Signed-off-by: David Ahern --- rdma/rdma.c | 9 ++- rdma/rdma.h | 11 +++ rdma/res.c | 30 +++----- rdma/utils.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 22 deletions(-) diff --git a/rdma/rdma.c b/rdma/rdma.c index b43e5385..010e9837 100644 --- a/rdma/rdma.c +++ b/rdma/rdma.c @@ -129,13 +129,14 @@ int main(int argc, char **argv) { "batch", required_argument, NULL, 'b' }, { NULL, 0, NULL, 0 } }; + bool show_driver_details = false; const char *batch_file = NULL; bool pretty_output = false; bool show_details = false; bool json_output = false; bool force = false; - char *filename; struct rd rd = {}; + char *filename; int opt; int err; @@ -152,7 +153,10 @@ int main(int argc, char **argv) pretty_output = true; break; case 'd': - show_details = true; + if (show_details) + show_driver_details = true; + else + show_details = true; break; case 'j': json_output = true; @@ -180,6 +184,7 @@ int main(int argc, char **argv) argv +=
optind; rd.show_details = show_details; + rd.show_driver_details = show_driver_details; rd.json_output = json_output; rd.pretty_output = pretty_output; diff --git a/rdma/rdma.h b/rdma/rdma.h index 1908fc4e..fcaf9e69 100644 --- a/rdma/rdma.h +++ b/rdma/rdma.h @@ -55,6 +55,7 @@ struct rd { char **argv; char *filename; bool show_details; + bool show_driver_details; struct list_head dev_map_list; uint32_t dev_idx; uint32_t port_idx; @@ -115,4 +116,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq); void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t flags); int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data); int rd_attr_cb(const struct nlattr *attr, void *data); +int rd_attr_check(const struct nlattr *attr, int *typep); + +/* + * Print helpers + */ +void print_driver_table(struct rd *rd, struct nlattr *tb); +void newline(struct rd *rd); +void newline_indent(struct rd *rd); +#define MAX_LINE_LENGTH 80 + #endif /* _RDMA_TOOL_H_ */ diff --git a/rdma/res.c b/rdma/res.c index 1a0aab60..074b9929 100644 --- a/rdma/res.c +++ b/rdma/res.c @@ -439,10 +439,8 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, void *data) if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) free(comm); - if (rd->json_output) - jsonw_end_array(rd->jw); - else - pr_out("\n"); + print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]); + newline(rd); } return MNL_CB_OK; } @@ -678,10 +676,8 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, void *data) if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) free(comm); - if (rd->json_output) - jsonw_end_array(rd->jw); - else - pr_out("\n"); + print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]); + newline(rd); } return MNL_CB_OK; } @@ -804,10 +800,8 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh, void *data) if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) free(comm); - if (rd->json_output) - jsonw_end_array(rd->jw); - else - pr_out("\n"); + print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]); + 
newline(rd); } return MNL_CB_OK; } @@ -919,10 +913,8 @@ static int res_mr_parse_cb(const struct nlmsghdr *nlh, void *data) if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) free(comm); - if (rd->json_output) - jsonw_end_array(rd->jw); - else - pr_out("\n"); + print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]); + newline(rd); } return MNL_CB_OK; } @@ -1004,10 +996,8 @@ static int res_pd_parse_cb(const struct nlmsghdr *nlh, void *data) if (nla_line[RDMA_NLDEV_ATTR_RES_PID]) free(comm); - if (rd->json_output) - jsonw_end_array(rd->jw); - else - pr_out("\n"); + print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]); + newline(rd); } return MNL_CB_OK; } diff --git a/rdma/utils.c b/rdma/utils.c index 49c967f3..4840bf22 100644 --- a/rdma/utils.c +++ b/rdma/utils.c @@ -11,6 +11,7 @@ #include "rdma.h" #include +#include int rd_argc(struct rd *rd) { @@ -393,8 +394,32 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_MRLEN] = MNL_TYPE_U64, [RDMA_NLDEV_ATTR_NDEV_INDEX] = MNL_TYPE_U32, [RDMA_NLDEV_ATTR_NDEV_NAME] = MNL_TYPE_NUL_STRING, + [RDMA_NLDEV_ATTR_DRIVER] = MNL_TYPE_NESTED, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = MNL_TYPE_NESTED, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = MNL_TYPE_NUL_STRING, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = MNL_TYPE_U8, + [RDMA_NLDEV_ATTR_DRIVER_S32] = MNL_TYPE_U32, + [RDMA_NLDEV_ATTR_DRIVER_U32] = MNL_TYPE_U32, + [RDMA_NLDEV_ATTR_DRIVER_S64] = MNL_TYPE_U64, + [RDMA_NLDEV_ATTR_DRIVER_U64] = MNL_TYPE_U64, }; +int rd_attr_check(const struct nlattr *attr, int *typep) +{ + int type; + + if (mnl_attr_type_valid(attr, RDMA_NLDEV_ATTR_MAX) < 0) + return MNL_CB_ERROR; + + type = mnl_attr_get_type(attr); + + if (mnl_attr_validate(attr, nldev_policy[type]) < 0) + return MNL_CB_ERROR; + + *typep = nldev_policy[type]; + return MNL_CB_OK; +} + int rd_attr_cb(const struct nlattr *attr, void *data) { const struct nlattr **tb = data; @@ -660,3 +685,174 @@ struct dev_map *dev_map_lookup(struct rd *rd, bool allow_port_index) 
free(dev_name); return dev_map; } + +#define nla_type(attr) ((attr)->nla_type & NLA_TYPE_MASK) + +void newline(struct rd *rd) +{ + if (rd->json_output) + jsonw_end_array(rd->jw); + else + pr_out("\n"); +} + +void newline_indent(struct rd *rd) +{ + newline(rd); + if (!rd->json_output) + pr_out(" "); +} + +static int print_driver_string(struct rd *rd, const char *key_str, + const char *val_str) +{ + if (rd->json_output) { + jsonw_string_field(rd->jw, key_str, val_str); + return 0; + } else { + return pr_out("%s %s ", key_str, val_str); + } +} + +static int print_driver_s32(struct rd *rd, const char *key_str, int32_t val, + enum rdma_nldev_print_type print_type) +{ + if (rd->json_output) { + jsonw_int_field(rd->jw, key_str, val); + return 0; + } + switch (print_type) { + case RDMA_NLDEV_PRINT_TYPE_UNSPEC: + return pr_out("%s %d ", key_str, val); + case RDMA_NLDEV_PRINT_TYPE_HEX: + return pr_out("%s 0x%x ", key_str, val); + default: + return -EINVAL; + } +} + +static int print_driver_u32(struct rd *rd, const char *key_str, uint32_t val, + enum rdma_nldev_print_type print_type) +{ + if (rd->json_output) { + jsonw_int_field(rd->jw, key_str, val); + return 0; + } + switch (print_type) { + case RDMA_NLDEV_PRINT_TYPE_UNSPEC: + return pr_out("%s %u ", key_str, val); + case RDMA_NLDEV_PRINT_TYPE_HEX: + return pr_out("%s 0x%x ", key_str, val); + default: + return -EINVAL; + } +} + +static int print_driver_s64(struct rd *rd, const char *key_str, int64_t val, + enum rdma_nldev_print_type print_type) +{ + if (rd->json_output) { + jsonw_int_field(rd->jw, key_str, val); + return 0; + } + switch (print_type) { + case RDMA_NLDEV_PRINT_TYPE_UNSPEC: + return pr_out("%s %" PRId64 " ", key_str, val); + case RDMA_NLDEV_PRINT_TYPE_HEX: + return pr_out("%s 0x%" PRIx64 " ", key_str, val); + default: + return -EINVAL; + } +} + +static int print_driver_u64(struct rd *rd, const char *key_str, uint64_t val, + enum rdma_nldev_print_type print_type) +{ + if (rd->json_output) { + 
jsonw_int_field(rd->jw, key_str, val); + return 0; + } + switch (print_type) { + case RDMA_NLDEV_PRINT_TYPE_UNSPEC: + return pr_out("%s %" PRIu64 " ", key_str, val); + case RDMA_NLDEV_PRINT_TYPE_HEX: + return pr_out("%s 0x%" PRIx64 " ", key_str, val); + default: + return -EINVAL; + } +} + +static int print_driver_entry(struct rd *rd, struct nlattr *key_attr, + struct nlattr *val_attr, + enum rdma_nldev_print_type print_type) +{ + const char *key_str = mnl_attr_get_str(key_attr); + int attr_type = nla_type(val_attr); + + switch (attr_type) { + case RDMA_NLDEV_ATTR_DRIVER_STRING: + return print_driver_string(rd, key_str, + mnl_attr_get_str(val_attr)); + case RDMA_NLDEV_ATTR_DRIVER_S32: + return print_driver_s32(rd, key_str, + mnl_attr_get_u32(val_attr), print_type); + case RDMA_NLDEV_ATTR_DRIVER_U32: + return print_driver_u32(rd, key_str, + mnl_attr_get_u32(val_attr), print_type); + case RDMA_NLDEV_ATTR_DRIVER_S64: + return print_driver_s64(rd, key_str, + mnl_attr_get_u64(val_attr), print_type); + case RDMA_NLDEV_ATTR_DRIVER_U64: + return print_driver_u64(rd, key_str, + mnl_attr_get_u64(val_attr), print_type); + } + return -EINVAL; +} + +void print_driver_table(struct rd *rd, struct nlattr *tb) +{ + int print_type = RDMA_NLDEV_PRINT_TYPE_UNSPEC; + struct nlattr *tb_entry, *key = NULL, *val; + int type, cc = 0; + int ret; + + if (!rd->show_driver_details || !tb) + return; + + if (rd->pretty_output) + newline_indent(rd); + + /* + * Driver attrs are tuples of {key, [print-type], value}. + * The key must be a string. If print-type is present, it + * defines an alternate printf format type vs the native format + * for the attribute. And the value can be any available + * driver type. 
+ */ + mnl_attr_for_each_nested(tb_entry, tb) { + + if (cc > MAX_LINE_LENGTH) { + if (rd->pretty_output) + newline_indent(rd); + cc = 0; + } + if (rd_attr_check(tb_entry, &type) != MNL_CB_OK) + return; + if (!key) { + if (type != MNL_TYPE_NUL_STRING) + return; + key = tb_entry; + } else if (type == MNL_TYPE_U8) { + print_type = mnl_attr_get_u8(tb_entry); + } else { + val = tb_entry; + ret = print_driver_entry(rd, key, val, print_type); + if (ret < 0) + return; + cc += ret; + print_type = RDMA_NLDEV_PRINT_TYPE_UNSPEC; + key = NULL; + } + } + return; +} From 853d222d789c4428ab6f763bf96070194ad11ec0 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 13:41:15 -0700 Subject: [PATCH 21/28] rdma: update man pages Update the man pages for the resource attributes as well as the driver-specific attributes. Signed-off-by: Steve Wise Signed-off-by: David Ahern --- man/man8/rdma-resource.8 | 29 ++++++++++++++++++++++++++--- man/man8/rdma.8 | 2 +- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8 index ff5d25d7..40b073db 100644 --- a/man/man8/rdma-resource.8 +++ b/man/man8/rdma-resource.8 @@ -7,12 +7,15 @@ rdma-resource \- rdma resource configuration .in +8 .ti -8 .B rdma -.RI "[ " OPTIONS " ]" -.B resource -.RI " { " COMMAND " | " +.RI "[ " OPTIONS " ] " RESOURCE " { " COMMAND " | " .BR help " }" .sp +.ti -8 +.IR RESOURCE " := { " +.BR cm_id " | " cq " | " mr " | " pd " | " qp " }" +.sp + .ti -8 .IR OPTIONS " := { " \fB\-j\fR[\fIson\fR] | @@ -70,11 +73,31 @@ rdma res show qp link mlx5_4/- -d Detailed view. .RE .PP +rdma res show qp link mlx5_4/- -dd +.RS 4 +Detailed view including driver-specific details. +.RE +.PP rdma res show qp link mlx5_4/1 lqpn 0-6 .RS 4 Limit to specific Local QPNs. .RE .PP +rdma resource show cm_id dst-port 7174 +.RS 4 +Show CM_IDs with destination ip port of 7174. 
+.RE +.PP +rdma resource show cm_id src-addr 172.16.0.100 +.RS 4 +Show CM_IDs bound to local ip address 172.16.0.100 +.RE +.PP +rdma resource show cq pid 30489 +.RS 4 +Show CQs belonging to pid 30489 +.RE +.PP .SH SEE ALSO .BR rdma (8), diff --git a/man/man8/rdma.8 b/man/man8/rdma.8 index 6f88f377..12aa149b 100644 --- a/man/man8/rdma.8 +++ b/man/man8/rdma.8 @@ -49,7 +49,7 @@ If there were any errors during execution of the commands, the application retur .TP .BR "\-d" , " --details" -Output detailed information. +Output detailed information. Adding a second \-d includes driver-specific details. .TP .BR "\-p" , " --pretty" From c2a569e63c63afd59742b05bb8099e4a8fe44cbf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 23 May 2018 12:58:34 -0700 Subject: [PATCH 22/28] Update kernel headers Update kernel headers to commit e89e59c08d1b ("Merge branch 'net-sfp-small-improvements'") Signed-off-by: David Ahern --- include/uapi/linux/devlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9f17286e..493f71fe 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -132,6 +132,16 @@ enum devlink_eswitch_encap_mode { DEVLINK_ESWITCH_ENCAP_MODE_BASIC, }; +enum devlink_port_flavour { + DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically + * facing the user. + */ + DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */ + DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture + * interconnect port. + */ +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -224,6 +234,10 @@ enum devlink_attr { DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, /* u64 */ DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */ + DEVLINK_ATTR_PORT_FLAVOUR, /* u16 */ + DEVLINK_ATTR_PORT_NUMBER, /* u32 */ + DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, From 852ed60528d5f08a0933298f4dffc0efb0265b56 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 20 May 2018 10:15:38 +0200 Subject: [PATCH 23/28] devlink: introduce support for showing port flavours Signed-off-by: Jiri Pirko Signed-off-by: David Ahern --- devlink/devlink.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/devlink/devlink.c b/devlink/devlink.c index fa33684c..df2c66da 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -1693,6 +1693,20 @@ static const char *port_type_name(uint32_t type) } } +static const char *port_flavour_name(uint16_t flavour) +{ + switch (flavour) { + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + return "physical"; + case DEVLINK_PORT_FLAVOUR_CPU: + return "cpu"; + case DEVLINK_PORT_FLAVOUR_DSA: + return "dsa"; + default: + return ""; + } +} + static void pr_out_port(struct dl *dl, struct nlattr **tb) { struct nlattr *pt_attr = tb[DEVLINK_ATTR_PORT_TYPE]; @@ -1717,6 +1731,12 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb) if (tb[DEVLINK_ATTR_PORT_IBDEV_NAME]) pr_out_str(dl, "ibdev", mnl_attr_get_str(tb[DEVLINK_ATTR_PORT_IBDEV_NAME])); + if (tb[DEVLINK_ATTR_PORT_FLAVOUR]) { + uint16_t port_flavour = + mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_FLAVOUR]); + + pr_out_str(dl, "flavour", port_flavour_name(port_flavour)); + } if (tb[DEVLINK_ATTR_PORT_SPLIT_GROUP]) pr_out_uint(dl, "split_group", mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_SPLIT_GROUP])); From 57ac202c78bb64d9e020158e786578928f5d2819 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 30 May 2018 08:06:19 -0700 Subject: [PATCH 24/28] Update kernel headers Update kernel 
headers to commit ae40832e53c3 ("bpfilter: fix a build err") Signed-off-by: David Ahern --- include/uapi/linux/bpf.h | 143 ++++++++++++++++++++++++++++++-- include/uapi/linux/if_addr.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/rtnetlink.h | 3 + include/uapi/linux/seg6_local.h | 12 +++ 5 files changed, 155 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c981596c..3b884100 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, }; enum bpf_map_type { @@ -141,6 +142,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { @@ -284,8 +286,8 @@ union bpf_attr { char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_id; /* BTF type_id of the key */ - __u32 btf_value_id; /* BTF type_id of the value */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -379,6 +381,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -1902,6 +1920,90 @@ union bpf_attr { * egress 
otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of param: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of param: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1976,7 +2078,11 @@ union bpf_attr { FN(fib_lookup), \ FN(sock_hash_update), \ FN(msg_redirect_hash), \ - FN(sk_redirect_hash), + FN(sk_redirect_hash), \ + FN(lwt_push_encap), \ + FN(lwt_seg6_store_bytes), \ + FN(lwt_seg6_adjust_srh), \ + FN(lwt_seg6_action), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2043,6 +2149,12 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2176,6 +2288,14 @@ enum sk_action { struct sk_msg_md { void *data; void *data_end; + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; #define BPF_TAG_SIZE 8 @@ -2197,6 +2317,10 @@ struct bpf_prog_info { __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2211,8 +2335,8 @@ struct bpf_map_info { __u64 netns_dev; __u64 netns_ino; __u32 btf_id; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; } __attribute__((aligned(8))); struct bpf_btf_info { @@ -2450,4 +2574,13 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* 
(symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + #endif /* __LINUX_BPF_H__ */ diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index c4899e22..a924606f 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -33,6 +33,7 @@ enum { IFA_CACHEINFO, IFA_MULTICAST, IFA_FLAGS, + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ __IFA_MAX, }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9318ecd0..4eccc7ff 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -331,6 +331,7 @@ enum { IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index aba8b18e..a501bc79 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -327,6 +327,9 @@ enum rtattr_type_t { RTA_PAD, RTA_UID, RTA_TTL_PROPAGATE, + RTA_IP_PROTO, + RTA_SPORT, + RTA_DPORT, __RTA_MAX }; diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 76b90d60..5312de80 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -25,6 +25,7 @@ enum { SEG6_LOCAL_NH6, SEG6_LOCAL_IIF, SEG6_LOCAL_OIF, + SEG6_LOCAL_BPF, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -59,10 +60,21 @@ enum { SEG6_LOCAL_ACTION_END_AS = 13, /* forward to SR-unaware VNF with masquerading */ SEG6_LOCAL_ACTION_END_AM = 14, + /* custom BPF action */ + SEG6_LOCAL_ACTION_END_BPF = 15, __SEG6_LOCAL_ACTION_MAX, }; #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) +enum { + SEG6_LOCAL_BPF_PROG_UNSPEC, + SEG6_LOCAL_BPF_PROG, + SEG6_LOCAL_BPF_PROG_NAME, + __SEG6_LOCAL_BPF_PROG_MAX, +}; + +#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) + #endif 
From 78d04c7b27cf42b77ed239e5ce26f2b7f75ee57b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 May 2018 08:10:00 -0700 Subject: [PATCH 25/28] ipaddress: Add support for address metric Add support for IFA_RT_PRIORITY using the same keywords as iproute for RTA_PRIORITY. Signed-off-by: David Ahern --- ip/ipaddress.c | 15 ++++++++++++++- man/man8/ip-address.8.in | 6 ++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 75539e05..6b53b753 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -63,7 +63,7 @@ static void usage(void) fprintf(stderr, " ip address {showdump|restore}\n"); fprintf(stderr, "IFADDR := PREFIX | ADDR peer PREFIX\n"); fprintf(stderr, " [ broadcast ADDR ] [ anycast ADDR ]\n"); - fprintf(stderr, " [ label IFNAME ] [ scope SCOPE-ID ]\n"); + fprintf(stderr, " [ label IFNAME ] [ scope SCOPE-ID ] [ metric METRIC ]\n"); fprintf(stderr, "SCOPE-ID := [ host | link | global | NUMBER ]\n"); fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n"); fprintf(stderr, "FLAG := [ permanent | dynamic | secondary | primary |\n"); @@ -1328,6 +1328,10 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n, rta_tb[IFA_ADDRESS])); } print_int(PRINT_ANY, "prefixlen", "/%d ", ifa->ifa_prefixlen); + + if (rta_tb[IFA_RT_PRIORITY]) + print_uint(PRINT_ANY, "metric", "metric %u ", + rta_getattr_u32(rta_tb[IFA_RT_PRIORITY])); } if (brief) @@ -2119,6 +2123,15 @@ static int ipaddr_modify(int cmd, int flags, int argc, char **argv) NEXT_ARG(); l = *argv; addattr_l(&req.n, sizeof(req), IFA_LABEL, l, strlen(l)+1); + } else if (matches(*argv, "metric") == 0 || + matches(*argv, "priority") == 0 || + matches(*argv, "preference") == 0) { + __u32 metric; + + NEXT_ARG(); + if (get_u32(&metric, *argv, 0)) + invarg("\"metric\" value is invalid\n", *argv); + addattr32(&req.n, sizeof(req), IFA_RT_PRIORITY, metric); } else if (matches(*argv, "valid_lft") == 0) { if (valid_lftp) duparg("valid_lft", *argv); diff --git 
a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in index 7ebf0bc9..c3861b37 100644 --- a/man/man8/ip-address.8.in +++ b/man/man8/ip-address.8.in @@ -27,6 +27,8 @@ ip-address \- protocol address management .IR IFNAME " ] [ " .B scope .IR SCOPE-ID " ] [ " +.B metric +.IR METRIC " ] [ " .B to .IR PREFIX " ] [ " FLAG-LIST " ] [ " .B label @@ -214,6 +216,10 @@ valid inside this site. - the address is valid only inside this host. .in -8 +.TP +.BI metric " NUMBER" +priority of prefix route associated with address. + .TP .BI valid_lft " LFT" the valid lifetime of this address; see section 5.5.4 of From 9107c425ace2bcc26e964d9f0cc629a8cdf4674d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 30 May 2018 08:30:09 -0700 Subject: [PATCH 26/28] ip route: print RTA_CACHEINFO if it exists RTA_CACHEINFO can be sent for non-cloned routes. If the attribute is present print it. Allows route dumps to print expires times for example which can exist on FIB entries. Signed-off-by: David Ahern --- ip/iproute.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index cbc43e2b..78c8085b 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -899,17 +899,14 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) rta_getattr_u32(tb[RTA_UID])); if (r->rtm_family == AF_INET) { - if (r->rtm_flags & RTM_F_CLONED) { + if (r->rtm_flags & RTM_F_CLONED) print_cache_flags(fp, r->rtm_flags); - if (tb[RTA_CACHEINFO]) - print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO])); - } + if (tb[RTA_CACHEINFO]) + print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO])); } else if (r->rtm_family == AF_INET6) { - if (r->rtm_flags & RTM_F_CLONED) { - if (tb[RTA_CACHEINFO]) - print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO])); - } + if (tb[RTA_CACHEINFO]) + print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO])); } if (tb[RTA_METRICS]) From 804c7fff76b981ac2a2b19690a0a910a9926de5c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 30 May 
2018 10:06:18 -0700 Subject: [PATCH 27/28] iproute: ip route get support for sport, dport and ipproto match Signed-off-by: Roopa Prabhu Signed-off-by: David Ahern --- ip/iproute.c | 26 +++++++++++++++++++++++++- man/man8/ip-route.8.in | 20 +++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/ip/iproute.c b/ip/iproute.c index 78c8085b..30833414 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -69,7 +69,8 @@ static void usage(void) " [ from ADDRESS iif STRING ]\n" " [ oif STRING ] [ tos TOS ]\n" " [ mark NUMBER ] [ vrf NAME ]\n" - " [ uid NUMBER ]\n" + " [ uid NUMBER ] [ ipproto PROTOCOL ]\n" + " [ sport NUMBER ] [ dport NUMBER ]\n" " ip route { add | del | change | append | replace } ROUTE\n" "SELECTOR := [ root PREFIX ] [ match PREFIX ] [ exact PREFIX ]\n" " [ table TABLE_ID ] [ vrf NAME ] [ proto RTPROTO ]\n" @@ -1991,6 +1992,29 @@ static int iproute_get(int argc, char **argv) req.r.rtm_family = addr.family; addattr_l(&req.n, sizeof(req), RTA_NEWDST, &addr.data, addr.bytelen); + } else if (matches(*argv, "sport") == 0) { + __be16 sport; + + NEXT_ARG(); + if (get_be16(&sport, *argv, 0)) + invarg("invalid sport\n", *argv); + addattr16(&req.n, sizeof(req), RTA_SPORT, sport); + } else if (matches(*argv, "dport") == 0) { + __be16 dport; + + NEXT_ARG(); + if (get_be16(&dport, *argv, 0)) + invarg("invalid dport\n", *argv); + addattr16(&req.n, sizeof(req), RTA_DPORT, dport); + } else if (matches(*argv, "ipproto") == 0) { + int ipproto; + + NEXT_ARG(); + ipproto = inet_proto_a2n(*argv); + if (ipproto < 0) + invarg("Invalid \"ipproto\" value\n", + *argv); + addattr8(&req.n, sizeof(req), RTA_IP_PROTO, ipproto); } else { inet_prefix addr; diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index b28f3d2c..b21a8472 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -38,7 +38,13 @@ ip-route \- routing table management .B tos .IR TOS " ] [ " .B vrf -.IR NAME " ] " +.IR NAME " ] [ " +.B ipproto +.IR PROTOCOL " ] [ " +.B 
sport +.IR NUMBER " ] [ " +.B dport +.IR NUMBER " ] " .ti -8 .BR "ip route" " { " add " | " del " | " change " | " append " | "\ @@ -1044,6 +1050,18 @@ the firewall mark .BI vrf " NAME" force the vrf device on which this packet will be routed. +.TP +.BI ipproto " PROTOCOL" +ip protocol as seen by the route lookup + +.TP +.BI sport " NUMBER" +source port as seen by the route lookup + +.TP +.BI dport " NUMBER" +destination port as seen by the route lookup + .TP .B connected if no source address From 831b5d40d91631b944e348fa2c5f0f2c46ffaccf Mon Sep 17 00:00:00 2001 From: Keara Leibovitz Date: Tue, 5 Jun 2018 16:44:19 -0400 Subject: [PATCH 28/28] tc: add json support in csum action Add json output support for checksum action. Example output: ~$ $TC actions add action csum udp continue index 7 ~$ $TC actions add action csum icmp iph igmp pipe index 200 cookie 112233 ~$ $TC -j actions ls action csum [{ "total acts":2 }, { "actions": [{ "order":0, "csum":"udp", "control_action": { "type":"continue" }, "index":7, "ref":1, "bind":0 }, { "order":1, "csum":"iph, icmp, igmp", "control_action": { "type":"pipe" }, "index":200, "ref":1, "bind":0, "cookie":"112233" }] }] v2: Don't initialized char buf[64]; Add output example Signed-off-by: Keara Leibovitz Signed-off-by: David Ahern --- tc/m_csum.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tc/m_csum.c b/tc/m_csum.c index 8391071d..752269d1 100644 --- a/tc/m_csum.c +++ b/tc/m_csum.c @@ -162,6 +162,7 @@ print_csum(struct action_util *au, FILE *f, struct rtattr *arg) char *uflag_5 = ""; char *uflag_6 = ""; char *uflag_7 = ""; + SPRINT_BUF(buf); int uflag_count = 0; @@ -198,12 +199,15 @@ print_csum(struct action_util *au, FILE *f, struct rtattr *arg) uflag_1 = "?empty"; } - fprintf(f, "csum (%s%s%s%s%s%s%s) ", - uflag_1, uflag_2, uflag_3, - uflag_4, uflag_5, uflag_6, uflag_7); + snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s", + uflag_1, uflag_2, uflag_3, + uflag_4, uflag_5, uflag_6, uflag_7); + 
print_string(PRINT_ANY, "csum", "csum (%s) ", buf); + print_action_control(f, "action ", sel->action, "\n"); - fprintf(f, "\tindex %u ref %d bind %d", sel->index, sel->refcnt, - sel->bindcnt); + print_uint(PRINT_ANY, "index", "\tindex %u", sel->index); + print_int(PRINT_ANY, "ref", " ref %d", sel->refcnt); + print_int(PRINT_ANY, "bind", " bind %d", sel->bindcnt); if (show_stats) { if (tb[TCA_CSUM_TM]) { @@ -212,7 +216,7 @@ print_csum(struct action_util *au, FILE *f, struct rtattr *arg) print_tm(f, tm); } } - fprintf(f, "\n"); + print_string(PRINT_FP, NULL, "%s", "\n"); return 0; }