Compare commits

..

5 Commits

Author SHA1 Message Date
Stephen Hemminger 5e155b73f7 v4.1.1 2015-07-06 14:57:34 -07:00
Roopa Prabhu e278010416 mpls: always set type RTN_UNICAST and scope RT_SCOPE_UNIVERSE for
This patch fixes incorrect -EINVAL errors due to invalid
scope and type during mpls route deletes.

$ip -f mpls route add 100 as 200 via inet 10.1.1.2 dev swp1

$ip -f mpls route show
100 as to 200 via inet 10.1.1.2 dev swp1

$ip -f mpls route del 100 as 200 via inet 10.1.1.2 dev swp1
RTNETLINK answers: Invalid argument

$ip -f mpls route del 100
RTNETLINK answers: Invalid argument

After patch:

$ip -f mpls route show
100 as to 200 via inet 10.1.1.2 dev swp1

$ip -f mpls route del 100 as 200 via inet 10.1.1.2 dev swp1

$ip -f mpls route show

Always set type to RTN_UNICAST for mpls route add/deletes.
Also to keep things consistent with kernel set scope to
RT_SCOPE_UNIVERSE for both mpls and ipv6 routes. Both mpls and ipv6 route
deletes ignore scope.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Vivek Venkataraman <vivek@cumulusnetworks.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
(cherry picked from commit f638e9f7c8)
2015-07-06 14:55:34 -07:00
Jan Engelhardt 0b60e8c016 build: must honor pkg-config flags for libmnl
The build otherwise fails if libmnl does not directly live in a
standard search path.

(cherry picked from commit a6ea668c91)
2015-07-06 14:55:09 -07:00
Gustavo Zacarias 6b40ba172e tipc: make build conditional on having libmnl
Signed-off-by: Gustavo Zacarias <gustavo@zacarias.com.ar>
(cherry picked from commit acfeb55a86)
2015-07-06 14:55:00 -07:00
Michal Kubeček 90f34ef5c9 include: add copy of tipc.h
Copy of kernel include/uapi/linux/tipc.h is needed to build on systems
with pre-3.16 kernel headers.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
(cherry picked from commit 38db20ff2d)
2015-07-06 14:54:34 -07:00
731 changed files with 33386 additions and 121544 deletions

View File

@ -1,130 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# clang-format configuration file. Intended for clang-format >= 4.
#
# For more information, see:
#
# Documentation/process/clang-format.rst
# https://clang.llvm.org/docs/ClangFormat.html
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
AlignOperands: true
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
#AfterExternBlock: false # Unknown to clang-format-5.0
BeforeCatch: false
BeforeElse: false
IndentBraces: false
#SplitEmptyFunction: true # Unknown to clang-format-4.0
#SplitEmptyRecord: true # Unknown to clang-format-4.0
#SplitEmptyNamespace: true # Unknown to clang-format-4.0
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
#CompactNamespaces: false # Unknown to clang-format-4.0
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 8
ContinuationIndentWidth: 8
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
#FixNamespaceComments: false # Unknown to clang-format-4.0
# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
# | sort | uniq
ForEachMacros:
- 'list_for_each_entry'
- 'list_for_each_entry_safe'
- 'mnl_attr_for_each_nested'
- 'hlist_for_each'
- 'hlist_for_each_safe'
- 'hlist_for_each_entry'
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
IncludeCategories:
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
#IndentPPDirectives: None # Unknown to clang-format-5.0
IndentWidth: 8
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
ObjCBlockIndentWidth: 8
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Taken from git's rules
#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
PenaltyBreakBeforeFirstCallParameter: 30
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 0
PenaltyBreakString: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: false
SortIncludes: false
#SortUsingDeclarations: false # Unknown to clang-format-4.0
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
SpaceBeforeParens: ControlStatements
#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp03
TabWidth: 8
UseTab: Always
...

15
.gitignore vendored
View File

@ -1,7 +1,6 @@
# locally generated
Config
static-syms.h
config.*
Config
*.o
*.a
*.so
@ -11,7 +10,6 @@ config.*
# cscope
cscope.*
ncscope.*
tags
TAGS
# git files that we don't want to ignore even if they are dot-files
@ -37,5 +35,12 @@ series
# tests
testsuite/results
testsuite/iproute2/iproute2-this
testsuite/tools/generate_nlmsg
testsuite/tests/ip/link/dev_wo_vf_rate.nl
# doc files generated at runtime
doc/*.aux
doc/*.log
doc/*.toc
doc/*.ps
doc/*.dvi
doc/*.html
doc/*.pdf

View File

@ -1,22 +0,0 @@
#
# This list is used by git-shortlog to fix a few botched name translations
# in the git archive, either because the author's full name was messed up
# and/or not always written the same way, making contributions from the
# same person appearing not to be so or badly displayed.
#
# Format
# Full name <goodaddress> <badaddress>
Steve Wise <larrystevenwise@gmail.com> <swise@opengridcomputing.com>
Steve Wise <larrystevenwise@gmail.com> <swise@chelsio.com>
Stephen Hemminger <stephen@networkplumber.org> <sthemmin@microsoft.com>
Stephen Hemminger <stephen@networkplumber.org> <shemming@brocade.com>
Stephen Hemminger <stephen@networkplumber.org> <stephen.hemminger@vyatta.com>
Stephen Hemminger <stephen@networkplumber.org> <shemminger@vyatta.com>
Stephen Hemminger <stephen@networkplumber.org> <shemminger>
Stephen Hemminger <stephen@networkplumber.org> <shemminger@linux-foundation.org>
Stephen Hemminger <stephen@networkplumber.org> <shemminger@osdl.org>
Stephen Hemminger <stephen@networkplumber.org> <osdl.org!shemminger>
Stephen Hemminger <stephen@networkplumber.org> <osdl.net!shemminger>
David Ahern <dsahern@gmail.com> <dsa@cumulusnetworks.com>

112
Makefile
View File

@ -1,31 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
# Top level Makefile for iproute2
-include config.mk
ifeq ("$(origin V)", "command line")
VERBOSE = $(V)
endif
ifndef VERBOSE
VERBOSE = 0
endif
ifeq ($(VERBOSE),0)
MAKEFLAGS += --no-print-directory
endif
PREFIX?=/usr
LIBDIR?=$(PREFIX)/lib
SBINDIR?=/sbin
CONFDIR?=/etc/iproute2
NETNS_RUN_DIR?=/var/run/netns
NETNS_ETC_DIR?=/etc/netns
DATADIR?=$(PREFIX)/share
HDRDIR?=$(PREFIX)/include/iproute2
DOCDIR?=$(DATADIR)/doc/iproute2
MANDIR?=$(DATADIR)/man
ARPDDIR?=/var/lib/arpd
KERNEL_INCLUDE?=/usr/include
BASH_COMPDIR?=$(DATADIR)/bash-completion/completions
# Path to db_185.h include
DBM_INCLUDE:=$(DESTDIR)/usr/include
@ -37,101 +18,70 @@ ifneq ($(SHARED_LIBS),y)
DEFINES+= -DNO_SHARED_LIBS
endif
DEFINES+=-DCONFDIR=\"$(CONFDIR)\" \
-DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \
-DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\"
DEFINES+=-DCONFDIR=\"$(CONFDIR)\"
#options for AX.25
ADDLIB+=ax25_ntop.o
#options for decnet
ADDLIB+=dnet_ntop.o dnet_pton.o
#options for ROSE
ADDLIB+=rose_ntop.o
#options for ipx
ADDLIB+=ipx_ntop.o ipx_pton.o
#options for mpls
ADDLIB+=mpls_ntop.o mpls_pton.o
#options for NETROM
ADDLIB+=netrom_ntop.o
CC := gcc
HOSTCC ?= $(CC)
CC = gcc
HOSTCC = gcc
DEFINES += -D_GNU_SOURCE
# Turn on transparent support for LFS
DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
CCOPTS = -O2 -pipe
CCOPTS = -O2
WFLAGS := -Wall -Wstrict-prototypes -Wmissing-prototypes
WFLAGS += -Wmissing-declarations -Wold-style-definition -Wformat=2
CFLAGS := $(WFLAGS) $(CCOPTS) -I../include -I../include/uapi $(DEFINES) $(CFLAGS)
CFLAGS := $(WFLAGS) $(CCOPTS) -I../include $(DEFINES) $(CFLAGS)
YACCFLAGS = -d -t -v
SUBDIRS=lib ip tc bridge misc netem genl tipc devlink rdma dcb man vdpa
SUBDIRS=lib ip tc bridge misc netem genl tipc man
LIBNETLINK=../lib/libutil.a ../lib/libnetlink.a
LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a
LDLIBS += $(LIBNETLINK)
all: config.mk
all: Config
@set -e; \
for i in $(SUBDIRS); \
do echo; echo $$i; $(MAKE) -C $$i; done
do $(MAKE) $(MFLAGS) -C $$i; done
.PHONY: clean clobber distclean check cscope version
help:
@echo "Make Targets:"
@echo " all - build binaries"
@echo " clean - remove products of build"
@echo " distclean - remove configuration and build"
@echo " install - install binaries on local machine"
@echo " check - run tests"
@echo " cscope - build cscope database"
@echo " version - update version"
@echo ""
@echo "Make Arguments:"
@echo " V=[0|1] - set build verbosity level"
config.mk:
@if [ ! -f config.mk -o configure -nt config.mk ]; then \
sh configure $(KERNEL_INCLUDE); \
fi
Config:
sh configure $(KERNEL_INCLUDE)
install: all
install -m 0755 -d $(DESTDIR)$(SBINDIR)
install -m 0755 -d $(DESTDIR)$(CONFDIR)
install -m 0755 -d $(DESTDIR)$(ARPDDIR)
install -m 0755 -d $(DESTDIR)$(HDRDIR)
@for i in $(SUBDIRS); do $(MAKE) -C $$i install; done
install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples
install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples/diffserv
install -m 0644 README.iproute2+tc $(shell find examples -maxdepth 1 -type f) \
$(DESTDIR)$(DOCDIR)/examples
install -m 0644 $(shell find examples/diffserv -maxdepth 1 -type f) \
$(DESTDIR)$(DOCDIR)/examples/diffserv
@for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done
install -m 0644 $(shell find etc/iproute2 -maxdepth 1 -type f) $(DESTDIR)$(CONFDIR)
install -m 0755 -d $(DESTDIR)$(BASH_COMPDIR)
install -m 0644 bash-completion/tc $(DESTDIR)$(BASH_COMPDIR)
install -m 0644 bash-completion/devlink $(DESTDIR)$(BASH_COMPDIR)
install -m 0644 include/bpf_elf.h $(DESTDIR)$(HDRDIR)
version:
echo "static const char version[] = \""`git describe --tags --long`"\";" \
> include/version.h
snapshot:
echo "static const char SNAPSHOT[] = \""`date +%y%m%d`"\";" \
> include/SNAPSHOT.h
clean:
@for i in $(SUBDIRS) testsuite; \
do $(MAKE) -C $$i clean; done
@for i in $(SUBDIRS) doc; \
do $(MAKE) $(MFLAGS) -C $$i clean; done
clobber:
touch config.mk
$(MAKE) clean
rm -f config.mk cscope.*
touch Config
$(MAKE) $(MFLAGS) clean
rm -f Config cscope.*
distclean: clobber
check: all
$(MAKE) -C testsuite
$(MAKE) -C testsuite alltests
@if command -v man >/dev/null 2>&1; then \
echo "Checking manpages for syntax errors..."; \
$(MAKE) -C man check; \
else \
echo "man not installed, skipping checks for syntax errors."; \
fi
cscope:
cscope -b -q -R -Iinclude -sip -slib -smisc -snetem -stc

33
README
View File

@ -1,39 +1,40 @@
This is a set of utilities for Linux networking.
Information:
https://wiki.linuxfoundation.org/networking/iproute2
http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
Download:
http://www.kernel.org/pub/linux/utils/net/iproute2/
Stable version repository:
git://git.kernel.org/pub/scm/network/iproute2/iproute2.git
Development repository:
git://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git
Repository:
git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git
How to compile this.
--------------------
1. libdbm
arpd needs to have the berkeleydb development libraries. For Debian
users this is the package with a name like libdbX.X-dev.
arpd needs to have the db4 development libraries. For Debian
users this is the package with a name like libdb4.x-dev.
DBM_INCLUDE points to the directory with db_185.h which
is the include file used by arpd to get to the old format Berkeley
database routines. Often this is in the db-devel package.
2. make
The makefile will automatically build a config.mk file which
contains definitions of libraries that may or may not be available
on the system such as: ATM, ELF, MNL, and SELINUX.
The makefile will automatically build a Config file which
contains whether or not ATM is available, etc.
3. include/uapi
3. To build the documentation, cd to the doc/ directory, then
look at the start of the Makefile and set the correct values for
PAGESIZE=a4, e.g. a4, letter ... (string)
PAGESPERPAGE=2, e.g. 1, 2 ... (numeric)
and run make there. This assumes that latex, dvips and psnup
are in your path.
This package includes matching sanitized kernel headers because
the build environment may not have up to date versions. See Makefile
if you have special requirements and need to point at different
kernel include files.
4. This package includes matching sanitized kernel headers because
the build environment may not have up to date versions. See Makefile
if you have special requirements and need to point at different
kernel include files.
Stephen Hemminger
stephen@networkplumber.org

33
README.decnet Normal file
View File

@ -0,0 +1,33 @@
Here are a few quick points about DECnet support...
o iproute2 is the tool of choice for configuring the DECnet support for
Linux. For many features, it is the only tool which can be used to
configure them.
o No name resolution is available as yet, all addresses must be
entered numerically.
o Remember to set the hardware address of the interface using:
ip link set ethX address xx:xx:xx:xx:xx:xx
(where xx:xx:xx:xx:xx:xx is the MAC address for your DECnet node
address)
if your Ethernet card won't listen to more than one unicast
mac address at once. If the Linux DECnet stack doesn't talk to
any other DECnet nodes, then check this with tcpdump and if it's
a problem, change the mac address (but do this _before_ starting
any other network protocol on the interface)
o Whilst you can use ip addr add to add more than one DECnet address to an
interface, don't expect addresses which are not the same as the
kernel's node address to work properly with 2.4 kernels. This should
be fine with 2.6 kernels as the routing code has been extensively
modified and improved.
o The DECnet support is currently self contained. It does not depend on
the libdnet library.
Steve Whitehouse <steve@chygwyn.com>

View File

@ -4,15 +4,12 @@ development. Most new features require a kernel and a utility component.
Please submit both to the Linux networking mailing list
<netdev@vger.kernel.org>
The current source for the stable version is in the git repository:
git://git.kernel.org/pub/scm/network/iproute2/iproute2.git
The current source is in the git repository:
git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git
The development git repository is available at the following address:
git://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git
The master branch contains the source corresponding to the current
code in the mainline Linux kernel (ie follows Linus). The net-next
branch is a temporary branch that tracks the code intended for the
next release; it corresponds with networking development branch in
the kernel.
The stable repository contains the source corresponding to the
current code in the Linux networking tree (net), which in turn is
aligned on the mainline Linux kernel (ie follows Linus).
The iproute2-next repository tracks the code intended for the next
release; it corresponds with networking development tree (net-next)
in the kernel.

95
README.distribution Normal file
View File

@ -0,0 +1,95 @@
I. About the distribution tables
The table used for "synthesizing" the distribution is essentially a scaled,
translated, inverse to the cumulative distribution function.
Here's how to think about it: Let F() be the cumulative distribution
function for a probability distribution X. We'll assume we've scaled
things so that X has mean 0 and standard deviation 1, though that's not
so important here. Then:
F(x) = P(X <= x) = \int_{-inf}^x f
where f is the probability density function.
F is monotonically increasing, so has an inverse function G, with domain
0 to 1. Here, G(t) = the x such that P(X <= x) = t. (In general, G may
have singularities if X has point masses, i.e., points x such that
P(X = x) > 0.)
Now we create a tabular representation of G as follows: Choose some table
size N, and for the ith entry, put in G(i/N). Let's call this table T.
The claim now is, I can create a (discrete) random variable Y whose
distribution has the same approximate "shape" as X, simply by letting
Y = T(U), where U is a discrete uniform random variable with range 1 to N.
To see this, it's enough to show that Y's cumulative distribution function,
(let's call it H), is a discrete approximation to F. But
H(x) = P(Y <= x)
= (# of entries in T <= x) / N -- as Y chosen uniformly from T
= i/N, where i is the largest integer such that G(i/N) <= x
= i/N, where i is the largest integer such that i/N <= F(x)
-- since G and F are inverse functions (and F is
increasing)
= floor(N*F(x))/N
as desired.
II. How to create distribution tables (in theory)
How can we create this table in practice? In some cases, F may have a
simple expression which allows evaluating its inverse directly. The
Pareto distribution is one example of this. In other cases, and
especially for matching an experimentally observed distribution, it's
easiest simply to create a table for F and "invert" it. Here, we give
a concrete example, namely how the new "experimental" distribution was
created.
1. Collect enough data points to characterize the distribution. Here, I
collected 25,000 "ping" roundtrip times to a "distant" point (time.nist.gov).
That's far more data than is really necessary, but it was fairly painless to
collect it, so...
2. Normalize the data so that it has mean 0 and standard deviation 1.
3. Determine the cumulative distribution. The code I wrote creates a table
covering the range -10 to +10, with granularity .00005. Obviously, this
is absurdly over-precise, but since it's a one-time only computation, I
figured it hardly mattered.
4. Invert the table: for each table entry F(x) = y, make the y*TABLESIZE
(here, 4096) entry be x*TABLEFACTOR (here, 8192). This creates a table
for the ("normalized") inverse of size TABLESIZE, covering its domain 0
to 1 with granularity 1/TABLESIZE. Note that even with the granularity
used in creating the table for F, it's possible not all the entries in
the table for G will be filled in. So, make a pass through the
inverse's table, filling in any missing entries by linear interpolation.
III. How to create distribution tables (in practice)
If you want to do all this yourself, I've provided several tools to help:
1. maketable does the steps 2-4 above, and then generates the appropriate
header file. So if you have your own time distribution, you can generate
the header simply by:
maketable < time.values > header.h
2. As explained in the other README file, the somewhat sleazy way I have
of generating correlated values needs correction. You can generate your
own correction tables by compiling makesigtable and makemutable with
your header file. Check the Makefile to see how this is done.
3. Warning: maketable, makesigtable and especially makemutable do
enormous amounts of floating point arithmetic. Don't try running
these on an old 486. (NIST Net itself will run fine on such a
system, since in operation, it just needs to do a few simple integral
calculations. But getting there takes some work.)
4. The tables produced are all normalized for mean 0 and standard
deviation 1. How do you know what values to use for real? Here, I've
provided a simple "stats" utility. Give it a series of floating point
values, and it will return their mean (mu), standard deviation (sigma),
and correlation coefficient (rho). You can then plug these values
directly into NIST Net.

123
README.iproute2+tc Normal file
View File

@ -0,0 +1,123 @@
iproute2+tc*
This is the first release of the Linux traffic control engine.
NOTES.
* The csz scheduler is not operational at the moment, and will probably
never be repaired but rather replaced with the h-pfq scheduler.
* To use the "fw" classifier you will need the ipfwchains patch.
* No manual is available. Ask me if you have problems (but try to work out
the answer yourself first 8)).
Micro-manual how to start it the first time
-------------------------------------------
A. Attach CBQ to eth1:
tc qdisc add dev eth1 root handle 1: cbq bandwidth 10Mbit allot 1514 cell 8 \
avpkt 1000 mpu 64
B. Add root class:
tc class add dev eth1 parent 1:0 classid 1:1 cbq bandwidth 10Mbit rate 10Mbit \
allot 1514 cell 8 weight 1Mbit prio 8 maxburst 20 avpkt 1000
C. Add default interactive class:
tc class add dev eth1 parent 1:1 classid 1:2 cbq bandwidth 10Mbit rate 1Mbit \
allot 1514 cell 8 weight 100Kbit prio 3 maxburst 20 avpkt 1000 split 1:0 \
defmap c0
D. Add default class:
tc class add dev eth1 parent 1:1 classid 1:3 cbq bandwidth 10Mbit rate 8Mbit \
allot 1514 cell 8 weight 800Kbit prio 7 maxburst 20 avpkt 1000 split 1:0 \
defmap 3f
etc. etc. etc. Well, it is enough to start 8) The rest can be guessed 8)
See also the more elaborate example, ready to start rsvpd,
in rsvp/cbqinit.eth1.
Terminology and advice about setting CBQ parameters may be found in Sally Floyd's
papers.
Pairs X:Y are class handles, X:0 are qdisc handles.
weight should be proportional to rate for leaf classes
(I chose it to be ten times less, but this is not required)
defmap is a bitmap of logical priorities served by this class.
E. Other qdiscs are simpler. For example, let's attach TBF to class 1:2
tc qdisc add dev eth1 parent 1:2 tbf rate 64Kbit buffer 5Kb/8 limit 10Kb
F. Look at all that we created:
tc qdisc ls dev eth1
tc class ls dev eth1
G. Install "route" classifier on root of cbq and map destination from realm
1 to class 1:2
tc filter add dev eth1 parent 1:0 protocol ip prio 100 route to 1 classid 1:2
H. Assign routes to 10.11.12.0/24 to realm 1
ip route add 10.11.12.0/24 dev eth1 via whatever realm 1
etc. The same thing can be made with rules.
I still did not test ipchains, but they should work too.
Setup and code example of BPF classifier and action can be found under
examples/bpf/, which should explain everything for getting started.
Setup of rsvp and u32 classifiers is more hairy.
If you read the RSVP specs, you will easily understand how the rsvp
classifier works. As for u32, here is an example:
#! /bin/sh
TC=/home/root/tc
# Setup classifier root on eth1 root (it is cbq)
$TC filter add dev eth1 parent 1:0 prio 5 protocol ip u32
# Create hash table of 256 slots with ID 1:
$TC filter add dev eth1 parent 1:0 prio 5 handle 1: u32 divisor 256
# Add to the 6th slot of the hash table a rule to select tcp/telnet to 193.233.7.75,
# direct it to class 1:4 and prescribe a fall back to best effort
# if the traffic violates TBF (32kbit,5K)
$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:6: \
match ip dst 193.233.7.75 \
match tcp dst 0x17 0xffff \
flowid 1:4 \
police rate 32kbit buffer 5kb/8 mpu 64 mtu 1514 index 1
# Add to the 1st slot of the hash table a rule to select icmp to 193.233.7.75,
# direct it to class 1:4 and prescribe a fall back to best effort
# if the traffic violates TBF (10kbit,5K)
$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:: \
sample ip protocol 1 0xff \
match ip dst 193.233.7.75 \
flowid 1:4 \
police rate 10kbit buffer 5kb/8 mpu 64 mtu 1514 index 2
# Look up the hash table if the frame is not fragmented
# Use protocol as hash key
$TC filter add dev eth1 parent 1:0 prio 5 handle ::1 u32 ht 800:: \
match ip nofrag \
offset mask 0x0F00 shift 6 \
hashkey mask 0x00ff0000 at 8 \
link 1:
Alexey Kuznetsov
kuznet@ms2.inr.ac.ru

81
README.lnstat Normal file
View File

@ -0,0 +1,81 @@
lnstat - linux networking statistics
(C) 2004 Harald Welte <laforge@gnumonks.org>
======================================================================
This tool is a generalized and more feature-complete replacement for the old
'rtstat' program.
In addition to routing cache statistics, it supports any kind of statistics
the linux kernel exports via a file in /proc/net/stat. In a stock 2.6.9
kernel, this is
per-protocol neighbour cache statistics
(ipv4, ipv6, atm, decnet)
routing cache statistics
(ipv4)
connection tracking statistics
(ipv4)
Please note that lnstat will adapt to any additional statistics that might be
added to the kernel at some later point.
I personally always like examples more than any reference documentation, so I
list the following examples. If somebody wants to do a manpage, feel free
to send me a patch :)
EXAMPLES:
In order to get a list of supported statistics files, you can run
lnstat -d
It will display something like
/proc/net/stat/arp_cache:
1: entries
2: allocs
3: destroys
[...]
/proc/net/stat/rt_cache:
1: entries
2: in_hit
3: in_slow_tot
You can now select the files/keys you are interested in by something like
lnstat -k arp_cache:entries,rt_cache:in_hit,arp_cache:destroys
arp_cach|rt_cache|arp_cach|
entries| in_hit|destroys|
6| 6| 0|
6| 0| 0|
6| 2| 0|
You can specify the interval (e.g. 10 seconds) by:
lnstat -i 10
You can specify to only use one particular statistics file:
lnstat -f ip_conntrack
You can specify individual field widths
lnstat -k arp_cache:entries,rt_cache:entries -w 20,8
You can specify not to print a header at all
lnstat -s 0
You can specify to print a header only at start of the program
lnstat -s 1
You can specify to print a header at start and every 20 lines:
lnstat -s 20
You can specify the number of samples you want to take (e.g. 5):
lnstat -c 5

File diff suppressed because it is too large Load Diff

View File

@ -1,809 +0,0 @@
# tc(8) completion -*- shell-script -*-
# Copyright 2016 6WIND S.A.
# Copyright 2016 Quentin Monnet <quentin.monnet@6wind.com>
# Known qdisc, filter and action kinds.
# Every list starts and ends with a space, and words are separated by single
# spaces, so that membership can be tested with a regex of the form ' word '.
# The long list is assembled with += rather than a backslash-newline: inside
# single quotes a backslash is NOT a line continuation, it would be stored
# literally in the value (together with the newline) and later be handed to
# compgen as if it were a word.
QDISC_KIND=' choke codel bfifo pfifo pfifo_head_drop fq fq_codel gred hhf'
QDISC_KIND+=' mqprio multiq netem pfifo_fast pie fq_pie red rr sfb sfq tbf atm'
QDISC_KIND+=' cbq drr dsmark hfsc htb prio qfq '
FILTER_KIND=' basic bpf cgroup flow flower fw route rsvp tcindex u32 matchall '
ACTION_KIND=' gact mirred bpf sample '
# Offer each word of the argument list as a completion candidate, unless that
# word already appears on the command line (positions 3 .. last-but-one of
# $words). Candidates are appended to COMPREPLY after filtering against $cur.
# The exit status carries no meaning for callers.
_tc_once_attr()
{
    local candidate idx seen
    local last=$(( ${#words[@]} - 1 ))
    for candidate in $*; do
        seen=
        idx=3
        while (( idx < last )); do
            # Right-hand side is deliberately unquoted, as in every sibling
            # helper: the command-line word is used as a pattern.
            if [[ $candidate == ${words[idx]} ]]; then
                seen=yes
                break
            fi
            (( idx++ ))
        done
        [[ -z $seen ]] && \
            COMPREPLY+=( $( compgen -W "$candidate" -- "$cur" ) )
    done
}
# Same as _tc_once_attr, but the search for already-used words starts at the
# index given as first argument instead of at position 3. The remaining
# arguments are the candidate words; each one not yet present from that index
# onwards is appended to COMPREPLY (filtered against $cur).
# The exit status carries no meaning for callers.
_tc_once_attr_from()
{
    local start=$1
    shift
    local candidate idx seen
    for candidate in $*; do
        seen=no
        for (( idx=$start; idx < ${#words[@]}-1; idx++ )); do
            [[ $candidate == ${words[idx]} ]] || continue
            seen=yes
            break
        done
        [[ $seen == no ]] && \
            COMPREPLY+=( $( compgen -W "$candidate" -- "$cur" ) )
    done
}
# Offer a group of mutually exclusive words: if any word of the argument list
# is already on the command line (positions 3 .. last-but-one of $words),
# nothing is added and 1 is returned; otherwise the whole list is appended to
# COMPREPLY after filtering against $cur.
_tc_one_of_list()
{
    local choice pos
    for (( pos=3; pos < ${#words[@]}-1; pos++ )); do
        for choice in $*; do
            if [[ $choice == ${words[pos]} ]]; then
                return 1
            fi
        done
    done
    COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) )
}
# Same as _tc_one_of_list, but the search for an already-used word starts at
# the index given as first argument. If any candidate is found from that index
# onwards, nothing is added and 1 is returned; otherwise all candidates are
# appended to COMPREPLY after filtering against $cur.
_tc_one_of_list_from()
{
    local start=$1
    shift
    local choice pos
    for (( pos=$start; pos < ${#words[@]}-1; pos++ )); do
        for choice in $*; do
            if [[ $choice == ${words[pos]} ]]; then
                return 1
            fi
        done
    done
    COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) )
}
# Print unit-suffixed variants of the number being typed.
# If $cur does not start with a digit, print nothing and return 1.
# Otherwise take the leading digits of $cur; print $cur itself on a first line
# when it consists of digits only, then print every argument prefixed with
# those digits (space separated) on the next line.
_tc_expand_units()
{
    [[ $cur =~ ^[0-9]+ ]] || return 1
    # The regex above matched exactly the leading run of digits
    local digits=${BASH_REMATCH[0]}
    if [[ $cur == $digits ]]; then
        echo $cur
    fi
    echo ${@/#/$digits}
}
# Complete based on given word, usually $prev (or possibly the word before),
# for when an argument or an option name has but a few possible arguments (so
# tc does not take particular commands into account here).
# Candidates are appended to COMPREPLY, filtered against $cur.
# Returns 0 if completion should stop after running this function, 1 otherwise.
_tc_direct_complete()
{
    case $1 in
        # Command options
        dev)
            _available_interfaces
            return 0
            ;;
        classid)
            return 0
            ;;
        estimator)
            local list=$( _tc_expand_units 'secs' 'msecs' 'usecs' )
            COMPREPLY+=( $( compgen -W "$list" -- "$cur" ) )
            return 0
            ;;
        handle)
            return 0
            ;;
        parent|flowid)
            local i iface ids cmd
            # Look for "dev IFACE" among the words typed so far; stop only
            # once it has actually been found.
            for (( i=3; i < ${#words[@]}-2; i++ )); do
                if [[ ${words[i]} == dev ]]; then
                    iface=${words[i+1]}
                    break
                fi
            done
            # Third space-separated field of each "tc ... show" line is the
            # handle/classid. A trailing blank keeps the two lists apart.
            for cmd in qdisc class; do
                if [[ -n $iface ]]; then
                    ids+=$( tc $cmd show dev $iface 2>/dev/null | \
                        cut -d' ' -f 3 )" "
                else
                    ids+=$( tc $cmd show 2>/dev/null | cut -d' ' -f 3 )" "
                fi
            done
            [[ $ids == *[![:space:]]* ]] && \
                COMPREPLY+=( $( compgen -W "$ids" -- "$cur" ) )
            return 0
            ;;
        protocol) # list comes from lib/ll_proto.c
            COMPREPLY+=( $( compgen -W ' 802.1Q 802.1ad 802_2 802_3 LLDP aarp
                all aoe arp atalk atmfate atmmpoa ax25 bpq can control cust
                ddcmp dec diag dna_dl dna_rc dna_rt econet ieeepup ieeepupat
                ip ipv4 ipv6 ipx irda lat localtalk loop mobitex ppp_disc
                ppp_mp ppp_ses ppptalk pup pupat rarp sca snap tipc tr_802_2
                wan_ppp x25' -- "$cur" ) )
            return 0
            ;;
        prio)
            return 0
            ;;
        stab)
            # NOTE(review): unlike the other command options, this case does
            # not return 0, so callers go on adding candidates. Kept as is;
            # confirm whether that is intended.
            COMPREPLY+=( $( compgen -W 'mtu tsize mpu overhead
                linklayer' -- "$cur" ) )
            ;;
        # Qdiscs and classes options
        alpha|bands|beta|buckets|corrupt|debug|decrement|default|\
        default_index|depth|direct_qlen|divisor|duplicate|ewma|flow_limit|\
        flows|hh_limit|increment|indices|linklayer|non_hh_weight|num_tc|\
        penalty_burst|penalty_rate|priomap|probability|queues|r2q|\
        reorder|vq|vqs)
            return 0
            ;;
        setup)
            COMPREPLY+=( $( compgen -W 'vqs' -- "$cur" ) )
            return 0
            ;;
        hw)
            COMPREPLY+=( $( compgen -W '1 0' -- "$cur" ) )
            return 0
            ;;
        distribution)
            COMPREPLY+=( $( compgen -W 'uniform normal pareto
                paretonormal' -- "$cur" ) )
            return 0
            ;;
        loss)
            COMPREPLY+=( $( compgen -W 'random state gmodel' -- "$cur" ) )
            return 0
            ;;
        # Qdiscs and classes options options
        gap|gmodel|state)
            return 0
            ;;
        # Filters options
        map)
            COMPREPLY+=( $( compgen -W 'key' -- "$cur" ) )
            return 0
            ;;
        hash)
            COMPREPLY+=( $( compgen -W 'keys' -- "$cur" ) )
            return 0
            ;;
        indev)
            _available_interfaces
            return 0
            ;;
        eth_type)
            COMPREPLY+=( $( compgen -W 'ipv4 ipv6' -- "$cur" ) )
            return 0
            ;;
        ip_proto)
            COMPREPLY+=( $( compgen -W 'tcp udp' -- "$cur" ) )
            return 0
            ;;
        # Filters options options
        key|keys)
            [[ ${words[@]} =~ graft ]] && return 1
            COMPREPLY+=( $( compgen -W 'src dst proto proto-src proto-dst iif
                priority mark nfct nfct-src nfct-dst nfct-proto-src
                nfct-proto-dst rt-classid sk-uid sk-gid vlan-tag rxhash' -- \
                "$cur" ) )
            return 0
            ;;
        # BPF options - used for filters, actions, and exec
        export|bytecode|bytecode-file|object-file)
            _filedir
            return 0
            ;;
        object-pinned|graft) # Pinned object is probably under /sys/fs/bpf/
            [[ -n "$cur" ]] && _filedir && return 0
            COMPREPLY=( $( compgen -G "/sys/fs/bpf/*" -- "$cur" ) ) || _filedir
            compopt -o nospace
            return 0
            ;;
        section)
            if (type objdump > /dev/null 2>&1) ; then
                local fword objfile section_list
                # Find the object file named earlier on the command line
                for (( fword=3; fword < ${#words[@]}-3; fword++ )); do
                    if [[ ${words[fword]} == object-file ]]; then
                        objfile=${words[fword+1]}
                        break
                    fi
                done
                section_list=$( objdump -h $objfile 2>/dev/null | \
                    sed -n 's/^ *[0-9]\+ \([^ ]*\) *.*/\1/p' )
                COMPREPLY+=( $( compgen -W "$section_list" -- "$cur" ) )
            fi
            return 0
            ;;
        import|run)
            _filedir
            return 0
            ;;
        type)
            COMPREPLY+=( $( compgen -W 'cls act' -- "$cur" ) )
            return 0
            ;;
        # Actions options
        random)
            _tc_one_of_list 'netrand determ'
            return 0
            ;;
        # Units for option arguments
        # (these cases fall through to "return 1" below)
        bandwidth|maxrate|peakrate|rate)
            local list=$( _tc_expand_units 'bit' \
                'kbit' 'kibit' 'kbps' 'kibps' \
                'mbit' 'mibit' 'mbps' 'mibps' \
                'gbit' 'gibit' 'gbps' 'gibps' \
                'tbit' 'tibit' 'tbps' 'tibps' )
            COMPREPLY+=( $( compgen -W "$list" -- "$cur" ) )
            ;;
        admit_bytes|avpkt|burst|cell|initial_quantum|limit|max|min|mtu|mpu|\
        overhead|quantum|redflowlimit)
            local list=$( _tc_expand_units \
                'b' 'kbit' 'k' 'mbit' 'm' 'gbit' 'g' )
            COMPREPLY+=( $( compgen -W "$list" -- "$cur" ) )
            ;;
        db|delay|evict_timeout|interval|latency|perturb|rehash|reset_timeout|\
        target|tupdate)
            local list=$( _tc_expand_units 'secs' 'msecs' 'usecs' )
            COMPREPLY+=( $( compgen -W "$list" -- "$cur" ) )
            ;;
    esac
    return 1
}
# Complete with options names for qdiscs. Each qdisc has its own set of options
# and it seems we cannot really parse it from anywhere, so we add it manually
# in this function.
# $1 is the qdisc kind. Option lists that span several lines rely on the
# newline inside the quotes as a separator; no backslash is used there, since
# inside single quotes it would be passed on literally as one more "option".
# Returns 0 if completion should stop after running this function, 1 otherwise.
_tc_qdisc_options()
{
    case $1 in
        choke)
            _tc_once_attr 'limit bandwidth ecn min max burst'
            return 0
            ;;
        codel)
            _tc_once_attr 'limit target interval'
            _tc_one_of_list 'ecn noecn'
            return 0
            ;;
        bfifo|pfifo|pfifo_head_drop)
            _tc_once_attr 'limit'
            return 0
            ;;
        fq)
            _tc_once_attr 'limit flow_limit quantum initial_quantum maxrate
                buckets'
            _tc_one_of_list 'pacing nopacing'
            return 0
            ;;
        fq_codel)
            _tc_once_attr 'limit flows target interval quantum'
            _tc_one_of_list 'ecn noecn'
            return 0
            ;;
        gred)
            _tc_once_attr 'setup vqs default grio vq prio limit min max avpkt
                burst probability bandwidth ecn harddrop'
            return 0
            ;;
        hhf)
            _tc_once_attr 'limit quantum hh_limit reset_timeout admit_bytes
                evict_timeout non_hh_weight'
            return 0
            ;;
        mqprio)
            _tc_once_attr 'num_tc map queues hw'
            return 0
            ;;
        netem)
            _tc_once_attr 'delay distribution corrupt duplicate loss ecn
                reorder rate'
            return 0
            ;;
        pie)
            _tc_once_attr 'limit target tupdate alpha beta'
            _tc_one_of_list 'bytemode nobytemode'
            _tc_one_of_list 'ecn noecn'
            _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator'
            return 0
            ;;
        fq_pie)
            _tc_once_attr 'limit flows target tupdate
                alpha beta quantum memory_limit ecn_prob'
            _tc_one_of_list 'ecn noecn'
            _tc_one_of_list 'bytemode nobytemode'
            _tc_one_of_list 'dq_rate_estimator no_dq_rate_estimator'
            return 0
            ;;
        red)
            _tc_once_attr 'limit min max avpkt burst adaptive probability
                bandwidth ecn harddrop'
            return 0
            ;;
        rr|prio)
            _tc_once_attr 'bands priomap multiqueue'
            return 0
            ;;
        sfb)
            _tc_once_attr 'rehash db limit max target increment decrement
                penalty_rate penalty_burst'
            return 0
            ;;
        sfq)
            _tc_once_attr 'limit perturb quantum divisor flows depth headdrop
                redflowlimit min max avpkt burst probability ecn harddrop'
            return 0
            ;;
        tbf)
            _tc_once_attr 'limit burst rate mtu peakrate latency overhead
                linklayer'
            return 0
            ;;
        cbq)
            _tc_once_attr 'bandwidth avpkt mpu cell ewma'
            return 0
            ;;
        dsmark)
            _tc_once_attr 'indices default_index set_tc_index'
            return 0
            ;;
        hfsc)
            _tc_once_attr 'default'
            return 0
            ;;
        htb)
            _tc_once_attr 'default r2q direct_qlen debug'
            return 0
            ;;
        # Known kinds for which no option name is proposed
        multiq|pfifo_fast|atm|drr|qfq)
            return 0
            ;;
    esac
    return 1
}
# Propose option names for BPF filters or actions.
# 'section' and 'export' are offered when "object-file" sits three words from
# the end of $words, or five words from the end with a word matching
# section/export three words from the end. The program source keywords are
# offered as an exclusive group, the remaining options once each.
# Always returns 0 (completion should stop after this function).
_tc_bpf_options()
{
    local nwords=${#words[@]}
    if [[ ${words[nwords-3]} == object-file ]]; then
        _tc_once_attr 'section export'
    fi
    if [[ ${words[nwords-5]} == object-file && \
          ${words[nwords-3]} =~ (section|export) ]]; then
        _tc_once_attr 'section export'
    fi
    _tc_one_of_list 'bytecode bytecode-file object-file object-pinned'
    _tc_once_attr 'verbose index direct-action action classid'
    return 0
}
# Complete with options names for filter actions.
# $1 is the index in $words from which to look for actions.
# This function is recursive, thus allowing multiple actions statement to be
# parsed: when a further "action" keyword is found, completion restarts from
# the word following it.
# Returns 0 if completion should stop after running this function, 1 otherwise
# (as written, every path returns 0).
_tc_filter_action_options()
{
    # Declared before the first loop so that the index stays private to this
    # invocation and neither leaks into the global scope nor clobbers the
    # loop index of the calling function.
    local acwd
    for ((acwd=$1; acwd < ${#words[@]}-1; acwd++)); do
        if [[ action == ${words[acwd]} ]]; then
            _tc_filter_action_options $((acwd+1)) && return 0
        fi
    done
    for ((acwd=$1; acwd < ${#words[@]}-1; acwd++)); do
        if [[ $ACTION_KIND =~ ' '${words[acwd]}' ' ]]; then
            _tc_one_of_list_from $acwd action
            _tc_action_options $acwd && return 0
        fi
    done
    # No action kind typed yet: propose them all
    _tc_one_of_list_from $acwd $ACTION_KIND
    return 0
}
# Complete with options names for filters.
# $1 is the index in $words of the filter kind. If an "action" keyword follows
# it, completion is delegated to _tc_filter_action_options.
# Option lists that span several lines rely on the newline inside the quotes
# as a separator; no backslash is used there, since inside single quotes it
# would be passed on literally as one more "option".
# Returns 0 if completion should stop after running this function, 1 otherwise.
_tc_filter_options()
{
    # Keep the loop index and the filter kind private to this function
    local acwd filter
    for ((acwd=$1; acwd < ${#words[@]}-1; acwd++)); do
        if [[ action == ${words[acwd]} ]]; then
            _tc_filter_action_options $((acwd+1)) && return 0
        fi
    done

    filter=${words[$1]}
    case $filter in
        basic)
            _tc_once_attr 'match action classid'
            return 0
            ;;
        bpf)
            _tc_bpf_options
            return 0
            ;;
        cgroup)
            _tc_once_attr 'match action'
            return 0
            ;;
        flow)
            local i
            # Once "key"/"keys" has been typed, also propose key names and
            # the operators that may follow them
            for (( i=5; i < ${#words[@]}-1; i++ )); do
                if [[ ${words[i]} =~ ^keys?$ ]]; then
                    _tc_direct_complete 'key'
                    COMPREPLY+=( $( compgen -W 'or and xor rshift addend' -- \
                        "$cur" ) )
                    break
                fi
            done
            _tc_once_attr 'map hash divisor baseclass match action'
            return 0
            ;;
        matchall)
            _tc_once_attr 'action classid skip_sw skip_hw'
            return 0
            ;;
        flower)
            _tc_once_attr 'action classid indev dst_mac src_mac eth_type
                ip_proto dst_ip src_ip dst_port src_port'
            return 0
            ;;
        fw)
            _tc_once_attr 'action classid'
            return 0
            ;;
        route)
            _tc_one_of_list 'from fromif'
            _tc_once_attr 'to classid action'
            return 0
            ;;
        rsvp)
            _tc_once_attr 'ipproto session sender classid action tunnelid
                tunnel flowlabel spi/ah spi/esp u8 u16 u32'
            [[ ${words[${#words[@]}-3]} == tunnel ]] && \
                COMPREPLY+=( $( compgen -W 'skip' -- "$cur" ) )
            [[ ${words[${#words[@]}-3]} =~ u(8|16|32) ]] && \
                COMPREPLY+=( $( compgen -W 'mask' -- "$cur" ) )
            [[ ${words[${#words[@]}-3]} == mask ]] && \
                COMPREPLY+=( $( compgen -W 'at' -- "$cur" ) )
            return 0
            ;;
        tcindex)
            _tc_once_attr 'hash mask shift classid action'
            _tc_one_of_list 'pass_on fall_through'
            return 0
            ;;
        u32)
            _tc_once_attr 'match link classid action offset ht hashkey sample'
            COMPREPLY+=( $( compgen -W 'ip ip6 udp tcp icmp u8 u16 u32 mark
                divisor' -- "$cur" ) )
            return 0
            ;;
    esac
    return 1
}
# Propose option names for the action whose kind sits at index $1 of $words.
# Words already typed from that index onwards are not proposed again.
# Returns 0 if completion should stop after running this function (known
# action kind), 1 otherwise.
_tc_action_options()
{
    local from=$1
    local kind=${words[from]}
    if [[ $kind == bpf ]]; then
        _tc_bpf_options
    elif [[ $kind == mirred ]]; then
        _tc_one_of_list_from $from 'ingress egress'
        _tc_one_of_list_from $from 'mirror redirect'
        _tc_once_attr_from $from 'index dev'
    elif [[ $kind == sample ]]; then
        _tc_once_attr_from $from 'rate'
        _tc_once_attr_from $from 'trunc'
        _tc_once_attr_from $from 'group'
    elif [[ $kind == gact ]]; then
        _tc_one_of_list_from $from 'reclassify drop continue pass'
        _tc_once_attr_from $from 'random'
    else
        return 1
    fi
    return 0
}
# Propose option names for "tc exec bpf"; $1 is the exec sub-command.
# Returns 0 if completion should stop after running this function (import or
# graft), 1 otherwise.
_tc_exec_options()
{
    # Word located three positions before the end of the command line
    local third_last=${words[${#words[@]}-3]}
    if [[ $1 == import ]]; then
        if [[ $third_last == import ]]; then
            _tc_once_attr 'run'
        fi
        return 0
    fi
    if [[ $1 == graft ]]; then
        COMPREPLY+=( $( compgen -W 'key type' -- "$cur" ) )
        if [[ $third_last == object-file ]]; then
            _tc_once_attr 'type'
        fi
        _tc_bpf_options
        return 0
    fi
    return 1
}
# Main completion function
# Logic is as follows:
#   1. Check if previous word is a global option; if so, propose arguments.
#   2. Check if current word is a global option; if so, propose completion.
#   3. Check for the presence of a main command (qdisc|class|filter|...). If
#      there is one, first call _tc_direct_complete to see if previous word is
#      waiting for a particular completion. If so, propose completion and exit.
#   4. Extract main command and -- if available -- its subcommand
#      (add|delete|show|...).
#   5. Propose completion based on main and sub- command in use. Additional
#      functions may be called for qdiscs, classes or filter options.
# Word lists that span several lines rely on the newline inside the quotes as
# a separator; a backslash inside single quotes would be kept literally.
_tc()
{
    local cur prev words cword
    _init_completion || return

    case $prev in
        -V|-Version)
            return 0
            ;;
        -b|-batch|-cf|-conf)
            _filedir
            return 0
            ;;
        -force)
            COMPREPLY=( $( compgen -W '-batch' -- "$cur" ) )
            return 0
            ;;
        -nm|-name)
            [[ -r /etc/iproute2/tc_cls ]] || \
                COMPREPLY=( $( compgen -W '-conf' -- "$cur" ) )
            return 0
            ;;
        -n|-net|-netns)
            local nslist=$( ip netns list 2>/dev/null )
            COMPREPLY+=( $( compgen -W "$nslist" -- "$cur" ) )
            return 0
            ;;
        -tshort)
            _tc_once_attr '-statistics'
            COMPREPLY+=( $( compgen -W 'monitor' -- "$cur" ) )
            return 0
            ;;
        -timestamp)
            _tc_once_attr '-statistics -tshort'
            COMPREPLY+=( $( compgen -W 'monitor' -- "$cur" ) )
            return 0
            ;;
    esac

    # Search for main commands
    local subcword cmd subcmd
    for (( subcword=1; subcword < ${#words[@]}-1; subcword++ )); do
        [[ ${words[subcword]} == -b?(atch) ]] && return 0
        [[ -n $cmd ]] && subcmd=${words[subcword]} && break
        # First word that is neither an option nor the argument of
        # -n/-net/-netns or -cf/-conf is the main command
        [[ ${words[subcword]} != -* && \
            ${words[subcword-1]} != -@(n?(et?(ns))|c?(on)f) ]] && \
            cmd=${words[subcword]}
    done

    if [[ -z $cmd ]]; then
        case $cur in
            -*)
                local c='-Version -statistics -details -raw -pretty
                    -iec -graph -batch -name -netns -timestamp'
                [[ $cword -eq 1 ]] && c+=' -force'
                COMPREPLY=( $( compgen -W "$c" -- "$cur" ) )
                return 0
                ;;
            *)
                # Object names are taken from the "OBJECT := { ... }" line of
                # the tc help output
                COMPREPLY=( $( compgen -W "help $( tc help 2>&1 | \
                    command sed \
                    -e '/OBJECT := /!d' \
                    -e 's/.*{//' \
                    -e 's/}.*//' \
                    -e 's/|//g' )" -- "$cur" ) )
                return 0
                ;;
        esac
    fi

    [[ $subcmd == help ]] && return 0

    # For this set of commands we may create COMPREPLY just by analysing the
    # previous word, if it expects for a specific list of options or values.
    if [[ $cmd =~ (qdisc|class|filter|action|exec) ]]; then
        _tc_direct_complete $prev && return 0
        if [[ ${words[${#words[@]}-3]} == estimator ]]; then
            local list=$( _tc_expand_units 'secs' 'msecs' 'usecs' )
            COMPREPLY+=( $( compgen -W "$list" -- "$cur" ) ) && return 0
        fi
    fi

    # Completion depends on main command and subcommand in use.
    case $cmd in
        qdisc)
            case $subcmd in
                add|change|replace|link|del|delete)
                    if [[ $(($cword-$subcword)) -eq 1 ]]; then
                        COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
                        return 0
                    fi
                    local qdisc qdwd
                    for ((qdwd=$subcword; qdwd < ${#words[@]}-1; qdwd++)); do
                        if [[ $QDISC_KIND =~ ' '${words[qdwd]}' ' ]]; then
                            qdisc=${words[qdwd]}
                            _tc_qdisc_options $qdisc && return 0
                        fi
                    done
                    _tc_one_of_list $QDISC_KIND
                    _tc_one_of_list 'root ingress parent clsact'
                    _tc_once_attr 'handle estimator stab'
                    ;;
                show)
                    _tc_once_attr 'dev'
                    _tc_one_of_list 'ingress clsact'
                    _tc_once_attr '-statistics -details -raw -pretty -iec
                        -graph -name'
                    ;;
                help)
                    return 0
                    ;;
                *)
                    [[ $cword -eq $subcword ]] && \
                        COMPREPLY=( $( compgen -W 'help add delete change
                            replace link show' -- "$cur" ) )
                    ;;
            esac
            ;;

        class)
            case $subcmd in
                add|change|replace|del|delete)
                    if [[ $(($cword-$subcword)) -eq 1 ]]; then
                        COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
                        return 0
                    fi
                    local qdisc qdwd
                    for ((qdwd=$subcword; qdwd < ${#words[@]}-1; qdwd++)); do
                        if [[ $QDISC_KIND =~ ' '${words[qdwd]}' ' ]]; then
                            qdisc=${words[qdwd]}
                            _tc_qdisc_options $qdisc && return 0
                        fi
                    done
                    _tc_one_of_list $QDISC_KIND
                    _tc_one_of_list 'root parent'
                    _tc_once_attr 'classid'
                    ;;
                show)
                    _tc_once_attr 'dev'
                    _tc_one_of_list 'root parent'
                    _tc_once_attr '-statistics -details -raw -pretty -iec
                        -graph -name'
                    ;;
                help)
                    return 0
                    ;;
                *)
                    [[ $cword -eq $subcword ]] && \
                        COMPREPLY=( $( compgen -W 'help add delete change
                            replace show' -- "$cur" ) )
                    ;;
            esac
            ;;

        filter)
            case $subcmd in
                add|change|replace|del|delete)
                    if [[ $(($cword-$subcword)) -eq 1 ]]; then
                        COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
                        return 0
                    fi
                    local filter fltwd
                    for ((fltwd=$subcword; fltwd < ${#words[@]}-1; fltwd++));
                    do
                        if [[ $FILTER_KIND =~ ' '${words[fltwd]}' ' ]]; then
                            _tc_filter_options $fltwd && return 0
                        fi
                    done
                    _tc_one_of_list $FILTER_KIND
                    _tc_one_of_list 'root ingress egress parent'
                    _tc_once_attr 'handle estimator pref protocol'
                    ;;
                show)
                    _tc_once_attr 'dev'
                    _tc_one_of_list 'root ingress egress parent'
                    _tc_once_attr '-statistics -details -raw -pretty -iec
                        -graph -name'
                    ;;
                help)
                    return 0
                    ;;
                *)
                    [[ $cword -eq $subcword ]] && \
                        COMPREPLY=( $( compgen -W 'help add delete change
                            replace show' -- "$cur" ) )
                    ;;
            esac
            ;;

        action)
            case $subcmd in
                add|change|replace)
                    local action acwd
                    for ((acwd=$subcword; acwd < ${#words[@]}-1; acwd++)); do
                        if [[ $ACTION_KIND =~ ' '${words[acwd]}' ' ]]; then
                            _tc_action_options $acwd && return 0
                        fi
                    done
                    _tc_one_of_list $ACTION_KIND
                    ;;
                get|del|delete)
                    _tc_once_attr 'index'
                    ;;
                lst|list|flush|show)
                    _tc_one_of_list $ACTION_KIND
                    ;;
                *)
                    [[ $cword -eq $subcword ]] && \
                        COMPREPLY=( $( compgen -W 'help add delete change
                            replace show list flush action' -- "$cur" ) )
                    ;;
            esac
            ;;

        monitor)
            COMPREPLY=( $( compgen -W 'help' -- "$cur" ) )
            ;;

        exec)
            case $subcmd in
                bpf)
                    local excmd exwd EXEC_KIND=' import debug graft '
                    for ((exwd=$subcword; exwd < ${#words[@]}-1; exwd++)); do
                        if [[ $EXEC_KIND =~ ' '${words[exwd]}' ' ]]; then
                            excmd=${words[exwd]}
                            _tc_exec_options $excmd && return 0
                        fi
                    done
                    _tc_one_of_list $EXEC_KIND
                    ;;
                *)
                    [[ $cword -eq $subcword ]] && \
                        COMPREPLY=( $( compgen -W 'bpf' -- "$cur" ) )
                    ;;
            esac
            ;;
    esac
} &&
complete -F _tc tc
# ex: ts=4 sw=4 et filetype=sh

View File

@ -1,15 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
BROBJ = bridge.o fdb.o monitor.o link.o mdb.o vlan.o
include ../config.mk
include ../Config
ifeq ($(IP_CONFIG_SETNS),y)
CFLAGS += -DHAVE_SETNS
endif
all: bridge
bridge: $(BROBJ) $(LIBNETLINK)
$(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@
bridge: $(BROBJ) $(LIBNETLINK)
install: all
install -m 0755 bridge $(DESTDIR)$(SBINDIR)
clean:
rm -f $(BROBJ) bridge

View File

@ -1,31 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
extern int print_linkinfo(const struct sockaddr_nl *who,
struct nlmsghdr *n,
void *arg);
extern int print_fdb(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg);
extern int print_mdb(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg);
#define MDB_RTA(r) \
((struct rtattr *)(((char *)(r)) + RTA_ALIGN(sizeof(struct br_mdb_entry))))
#define MDB_RTR_RTA(r) \
((struct rtattr *)(((char *)(r)) + RTA_ALIGN(sizeof(__u32))))
void print_vlan_info(struct rtattr *tb, int ifindex);
int print_linkinfo(struct nlmsghdr *n, void *arg);
int print_mdb_mon(struct nlmsghdr *n, void *arg);
int print_fdb(struct nlmsghdr *n, void *arg);
void print_stp_state(__u8 state);
int parse_stp_state(const char *arg);
int print_vlan_rtm(struct nlmsghdr *n, void *arg, bool monitor,
bool global_only);
void br_print_router_port_stats(struct rtattr *pattr);
int do_fdb(int argc, char **argv);
int do_mdb(int argc, char **argv);
int do_monitor(int argc, char **argv);
int do_vlan(int argc, char **argv);
int do_link(int argc, char **argv);
extern int do_fdb(int argc, char **argv);
extern int do_mdb(int argc, char **argv);
extern int do_monitor(int argc, char **argv);
extern int do_vlan(int argc, char **argv);
extern int do_link(int argc, char **argv);
extern int preferred_family;
extern int show_stats;
extern int show_details;
extern int timestamp;
extern int compress_vlans;
extern int json;
extern struct rtnl_handle rth;

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Get/set/delete bridge with netlink
*
@ -10,25 +9,21 @@
#include <unistd.h>
#include <sys/socket.h>
#include <string.h>
#include <errno.h>
#include "version.h"
#include "SNAPSHOT.h"
#include "utils.h"
#include "br_common.h"
#include "namespace.h"
#include "color.h"
struct rtnl_handle rth = { .fd = -1 };
int preferred_family = AF_UNSPEC;
int oneline;
int resolve_hosts;
int oneline = 0;
int show_stats;
int show_details;
static int color;
int compress_vlans;
int json;
int timestamp;
static const char *batch_file;
int force;
char * _SL_ = NULL;
static void usage(void) __attribute__((noreturn));
@ -36,11 +31,10 @@ static void usage(void)
{
fprintf(stderr,
"Usage: bridge [ OPTIONS ] OBJECT { COMMAND | help }\n"
" bridge [ -force ] -batch filename\n"
"where OBJECT := { link | fdb | mdb | vlan | monitor }\n"
" OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] |\n"
" -o[neline] | -t[imestamp] | -n[etns] name |\n"
" -c[ompressvlans] -color -p[retty] -j[son] }\n");
" -c[ompressvlans] }\n");
exit(-1);
}
@ -54,9 +48,9 @@ static const struct cmd {
const char *cmd;
int (*func)(int argc, char **argv);
} cmds[] = {
{ "link", do_link },
{ "fdb", do_fdb },
{ "mdb", do_mdb },
{ "link", do_link },
{ "fdb", do_fdb },
{ "mdb", do_mdb },
{ "vlan", do_vlan },
{ "monitor", do_monitor },
{ "help", do_help },
@ -72,40 +66,16 @@ static int do_cmd(const char *argv0, int argc, char **argv)
return c->func(argc-1, argv+1);
}
fprintf(stderr,
"Object \"%s\" is unknown, try \"bridge help\".\n", argv0);
fprintf(stderr, "Object \"%s\" is unknown, try \"bridge help\".\n", argv0);
return -1;
}
/*
 * Batch-mode callback: run one bridge command line.
 * The first word selects the object handler via do_cmd(); the opaque
 * data pointer is not used.
 */
static int br_batch_cmd(int argc, char *argv[], void *data)
{
	const char *object = argv[0];

	return do_cmd(object, argc, argv);
}
/*
 * Execute the commands found in batch file "name".
 * Opens the global rtnetlink handle, requests strict dump checking on it,
 * hands the file to do_batch() and closes the handle again.
 * Returns EXIT_FAILURE if the handle cannot be opened, otherwise whatever
 * do_batch() returned.
 */
static int batch(const char *name)
{
	int status = EXIT_FAILURE;

	if (rtnl_open(&rth, 0) >= 0) {
		rtnl_set_strict_dump(&rth);
		status = do_batch(name, force, br_batch_cmd, NULL);
		rtnl_close(&rth);
	} else {
		fprintf(stderr, "Cannot open rtnetlink\n");
	}
	return status;
}
int
main(int argc, char **argv)
{
while (argc > 1) {
const char *opt = argv[1];
if (strcmp(opt, "--") == 0) {
char *opt = argv[1];
if (strcmp(opt,"--") == 0) {
argc--; argv++;
break;
}
@ -117,7 +87,7 @@ main(int argc, char **argv)
if (matches(opt, "-help") == 0) {
usage();
} else if (matches(opt, "-Version") == 0) {
printf("bridge utility, %s\n", version);
printf("bridge utility, 0.0\n");
exit(0);
} else if (matches(opt, "-stats") == 0 ||
matches(opt, "-statistics") == 0) {
@ -128,7 +98,7 @@ main(int argc, char **argv)
++oneline;
} else if (matches(opt, "-timestamp") == 0) {
++timestamp;
} else if (matches(opt, "-family") == 0) {
} else if (matches(opt, "-family") == 0) {
argc--;
argv++;
if (argc <= 1)
@ -149,42 +119,20 @@ main(int argc, char **argv)
NEXT_ARG();
if (netns_switch(argv[1]))
exit(-1);
} else if (matches_color(opt, &color)) {
} else if (matches(opt, "-compressvlans") == 0) {
++compress_vlans;
} else if (matches(opt, "-force") == 0) {
++force;
} else if (matches(opt, "-json") == 0) {
++json;
} else if (matches(opt, "-pretty") == 0) {
++pretty;
} else if (matches(opt, "-batch") == 0) {
argc--;
argv++;
if (argc <= 1)
usage();
batch_file = argv[1];
} else {
fprintf(stderr,
"Option \"%s\" is unknown, try \"bridge help\".\n",
opt);
fprintf(stderr, "Option \"%s\" is unknown, try \"bridge help\".\n", opt);
exit(-1);
}
argc--; argv++;
}
_SL_ = oneline ? "\\" : "\n";
check_enable_color(color, json);
if (batch_file)
return batch(batch_file);
_SL_ = oneline ? "\\" : "\n" ;
if (rtnl_open(&rth, 0) < 0)
exit(1);
rtnl_set_strict_dump(&rth);
if (argc > 1)
return do_cmd(argv[1], argc-1, argv+1);

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Get/set/delete fdb table with netlink
*
@ -22,33 +21,24 @@
#include <linux/neighbour.h>
#include <string.h>
#include <limits.h>
#include <stdbool.h>
#include "json_print.h"
#include "libnetlink.h"
#include "br_common.h"
#include "rt_names.h"
#include "utils.h"
static unsigned int filter_index, filter_dynamic, filter_master,
filter_state, filter_vlan;
static unsigned int filter_index;
static void usage(void)
{
fprintf(stderr,
"Usage: bridge fdb { add | append | del | replace } ADDR dev DEV\n"
" [ self ] [ master ] [ use ] [ router ] [ extern_learn ]\n"
" [ sticky ] [ local | static | dynamic ] [ vlan VID ]\n"
" { [ dst IPADDR ] [ port PORT] [ vni VNI ] | [ nhid NHID ] }\n"
" [ via DEV ] [ src_vni VNI ]\n"
" bridge fdb [ show [ br BRDEV ] [ brport DEV ] [ vlan VID ]\n"
" [ state STATE ] [ dynamic ] ]\n"
" bridge fdb get [ to ] LLADDR [ br BRDEV ] { brport | dev } DEV\n"
" [ vlan VID ] [ vni VNI ] [ self ] [ master ] [ dynamic ]\n");
fprintf(stderr, "Usage: bridge fdb { add | append | del | replace } ADDR dev DEV {self|master} [ temp ]\n"
" [router] [ dst IPADDR] [ vlan VID ]\n"
" [ port PORT] [ vni VNI ] [via DEV]\n");
fprintf(stderr, " bridge fdb {show} [ br BRDEV ] [ brport DEV ]\n");
exit(-1);
}
static const char *state_n2a(unsigned int s)
static const char *state_n2a(unsigned s)
{
static char buf[32];
@ -64,87 +54,21 @@ static const char *state_n2a(unsigned int s)
if (s & NUD_REACHABLE)
return "";
if (is_json_context())
sprintf(buf, "%#x", s);
else
sprintf(buf, "state=%#x", s);
sprintf(buf, "state=%#x", s);
return buf;
}
/*
 * Convert a state argument into a NUD_* bit mask stored in *s.
 * Keywords are tried in this order: permanent, static/temp, stale,
 * reachable/dynamic (all through matches()), then the exact word "all"
 * (every bit set). Anything else is handed to get_unsigned().
 * Returns 0 on success, -1 if get_unsigned() rejects the argument.
 */
static int state_a2n(unsigned int *s, const char *arg)
{
	if (!matches(arg, "permanent")) {
		*s = NUD_PERMANENT;
		return 0;
	}
	if (!matches(arg, "static") || !matches(arg, "temp")) {
		*s = NUD_NOARP;
		return 0;
	}
	if (!matches(arg, "stale")) {
		*s = NUD_STALE;
		return 0;
	}
	if (!matches(arg, "reachable") || !matches(arg, "dynamic")) {
		*s = NUD_REACHABLE;
		return 0;
	}
	if (!strcmp(arg, "all")) {
		*s = ~0;
		return 0;
	}
	return get_unsigned(s, arg, 0) ? -1 : 0;
}
/*
 * Print the names of the NTF_* bits set in "flags", each followed by a
 * space, inside an array that is opened and closed for JSON output only
 * (named "flags" in JSON context). The fp argument is not used here.
 */
static void fdb_print_flags(FILE *fp, unsigned int flags)
{
	/* Table order is the output order. */
	static const struct {
		unsigned int bit;
		const char *name;
	} flag_names[] = {
		{ NTF_SELF,        "self" },
		{ NTF_ROUTER,      "router" },
		{ NTF_EXT_LEARNED, "extern_learn" },
		{ NTF_OFFLOADED,   "offload" },
		{ NTF_MASTER,      "master" },
		{ NTF_STICKY,      "sticky" },
	};
	unsigned int k;

	open_json_array(PRINT_JSON,
			is_json_context() ? "flags" : "");

	for (k = 0; k < sizeof(flag_names) / sizeof(flag_names[0]); k++) {
		if (flags & flag_names[k].bit)
			print_string(PRINT_ANY, NULL, "%s ",
				     flag_names[k].name);
	}

	close_json_array(PRINT_JSON, NULL);
}
/*
 * Print the "used" and "updated" values of a neighbour cache info record,
 * each divided by the value returned by get_user_hz() (fetched once and
 * cached). JSON context emits two separate keys; plain output writes
 * "used U/V " to fp.
 */
static void fdb_print_stats(FILE *fp, const struct nda_cacheinfo *ci)
{
	static int hz;

	if (hz == 0)
		hz = get_user_hz();

	if (!is_json_context()) {
		fprintf(fp, "used %d/%d ", ci->ndm_used / hz,
			ci->ndm_updated / hz);
		return;
	}

	print_uint(PRINT_JSON, "used", NULL,
		   ci->ndm_used / hz);
	print_uint(PRINT_JSON, "updated", NULL,
		   ci->ndm_updated / hz);
}
int print_fdb(struct nlmsghdr *n, void *arg)
int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
{
FILE *fp = arg;
struct ndmsg *r = NLMSG_DATA(n);
int len = n->nlmsg_len;
struct rtattr *tb[NDA_MAX+1];
__u16 vid = 0;
struct rtattr * tb[NDA_MAX+1];
if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH) {
fprintf(stderr, "Not RTM_NEWNEIGH: %08x %08x %08x\n",
n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
return 0;
}
@ -160,165 +84,102 @@ int print_fdb(struct nlmsghdr *n, void *arg)
if (filter_index && filter_index != r->ndm_ifindex)
return 0;
if (filter_state && !(r->ndm_state & filter_state))
return 0;
parse_rtattr(tb, NDA_MAX, NDA_RTA(r),
n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
if (tb[NDA_VLAN])
vid = rta_getattr_u16(tb[NDA_VLAN]);
if (filter_vlan && filter_vlan != vid)
return 0;
if (filter_dynamic && (r->ndm_state & NUD_PERMANENT))
return 0;
open_json_object(NULL);
if (n->nlmsg_type == RTM_DELNEIGH)
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
fprintf(fp, "Deleted ");
if (tb[NDA_LLADDR]) {
const char *lladdr;
SPRINT_BUF(b1);
lladdr = ll_addr_n2a(RTA_DATA(tb[NDA_LLADDR]),
RTA_PAYLOAD(tb[NDA_LLADDR]),
ll_index_to_type(r->ndm_ifindex),
b1, sizeof(b1));
print_color_string(PRINT_ANY, COLOR_MAC,
"mac", "%s ", lladdr);
fprintf(fp, "%s ",
ll_addr_n2a(RTA_DATA(tb[NDA_LLADDR]),
RTA_PAYLOAD(tb[NDA_LLADDR]),
ll_index_to_type(r->ndm_ifindex),
b1, sizeof(b1)));
}
if (!filter_index && r->ndm_ifindex) {
print_string(PRINT_FP, NULL, "dev ", NULL);
print_color_string(PRINT_ANY, COLOR_IFNAME,
"ifname", "%s ",
ll_index_to_name(r->ndm_ifindex));
}
if (!filter_index && r->ndm_ifindex)
fprintf(fp, "dev %s ", ll_index_to_name(r->ndm_ifindex));
if (tb[NDA_DST]) {
SPRINT_BUF(abuf);
int family = AF_INET;
const char *dst;
if (RTA_PAYLOAD(tb[NDA_DST]) == sizeof(struct in6_addr))
family = AF_INET6;
dst = format_host(family,
RTA_PAYLOAD(tb[NDA_DST]),
RTA_DATA(tb[NDA_DST]));
print_string(PRINT_FP, NULL, "dst ", NULL);
print_color_string(PRINT_ANY,
ifa_family_color(family),
"dst", "%s ", dst);
fprintf(fp, "dst %s ",
format_host(family,
RTA_PAYLOAD(tb[NDA_DST]),
RTA_DATA(tb[NDA_DST]),
abuf, sizeof(abuf)));
}
if (vid)
print_uint(PRINT_ANY,
"vlan", "vlan %hu ", vid);
if (tb[NDA_VLAN]) {
__u16 vid = rta_getattr_u16(tb[NDA_VLAN]);
fprintf(fp, "vlan %hu ", vid);
}
if (tb[NDA_PORT])
print_uint(PRINT_ANY,
"port", "port %u ",
rta_getattr_be16(tb[NDA_PORT]));
fprintf(fp, "port %d ", ntohs(rta_getattr_u16(tb[NDA_PORT])));
if (tb[NDA_VNI])
print_uint(PRINT_ANY,
"vni", "vni %u ",
rta_getattr_u32(tb[NDA_VNI]));
if (tb[NDA_SRC_VNI])
print_uint(PRINT_ANY,
"src_vni", "src_vni %u ",
rta_getattr_u32(tb[NDA_SRC_VNI]));
fprintf(fp, "vni %d ", rta_getattr_u32(tb[NDA_VNI]));
if (tb[NDA_IFINDEX]) {
unsigned int ifindex = rta_getattr_u32(tb[NDA_IFINDEX]);
if (tb[NDA_LINK_NETNSID])
print_uint(PRINT_ANY,
"viaIfIndex", "via ifindex %u ",
ifindex);
else
print_string(PRINT_ANY,
"viaIf", "via %s ",
ll_index_to_name(ifindex));
if (ifindex) {
char ifname[IF_NAMESIZE];
if (!tb[NDA_LINK_NETNSID] &&
if_indextoname(ifindex, ifname))
fprintf(fp, "via %s ", ifname);
else
fprintf(fp, "via ifindex %u ", ifindex);
}
}
if (tb[NDA_NH_ID])
print_uint(PRINT_ANY, "nhid", "nhid %u ",
rta_getattr_u32(tb[NDA_NH_ID]));
if (tb[NDA_LINK_NETNSID])
print_uint(PRINT_ANY,
"linkNetNsId", "link-netnsid %d ",
rta_getattr_u32(tb[NDA_LINK_NETNSID]));
if (show_stats && tb[NDA_CACHEINFO])
fdb_print_stats(fp, RTA_DATA(tb[NDA_CACHEINFO]));
fdb_print_flags(fp, r->ndm_flags);
fprintf(fp, "link-netnsid %d ",
rta_getattr_u32(tb[NDA_LINK_NETNSID]));
if (show_stats && tb[NDA_CACHEINFO]) {
struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]);
int hz = get_user_hz();
fprintf(fp, "used %d/%d ", ci->ndm_used/hz,
ci->ndm_updated/hz);
}
if (r->ndm_flags & NTF_SELF)
fprintf(fp, "self ");
if (tb[NDA_MASTER])
print_string(PRINT_ANY, "master", "master %s ",
ll_index_to_name(rta_getattr_u32(tb[NDA_MASTER])));
print_string(PRINT_ANY, "state", "%s\n",
state_n2a(r->ndm_state));
close_json_object();
fflush(fp);
return 0;
}
/*
 * Apply the current fdb filters to a link dump request.
 * A non-zero filter_index is written into the ifinfomsg header of the
 * request; a non-zero filter_master is appended as an IFLA_MASTER
 * attribute, bounded by reqlen.
 * Returns 0 on success or the non-zero result of addattr32().
 */
static int fdb_linkdump_filter(struct nlmsghdr *nlh, int reqlen)
{
	if (filter_index) {
		struct ifinfomsg *hdr = NLMSG_DATA(nlh);

		hdr->ifi_index = filter_index;
	}

	if (!filter_master)
		return 0;

	return addattr32(nlh, reqlen, IFLA_MASTER, filter_master);
}
static int fdb_dump_filter(struct nlmsghdr *nlh, int reqlen)
{
int err;
if (filter_index) {
struct ndmsg *ndm = NLMSG_DATA(nlh);
ndm->ndm_ifindex = filter_index;
}
if (filter_master) {
err = addattr32(nlh, reqlen, NDA_MASTER, filter_master);
if (err)
return err;
}
fprintf(fp, "master %s ",
ll_index_to_name(rta_getattr_u32(tb[NDA_MASTER])));
else if (r->ndm_flags & NTF_MASTER)
fprintf(fp, "master ");
if (r->ndm_flags & NTF_ROUTER)
fprintf(fp, "router ");
if (r->ndm_flags & NTF_EXT_LEARNED)
fprintf(fp, "offload ");
fprintf(fp, "%s\n", state_n2a(r->ndm_state));
return 0;
}
static int fdb_show(int argc, char **argv)
{
struct {
struct nlmsghdr n;
struct ifinfomsg ifm;
char buf[256];
} req;
char *filter_dev = NULL;
char *br = NULL;
int rc;
int msg_size = sizeof(struct ifinfomsg);
memset(&req, 0, sizeof(req));
req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.ifm.ifi_family = PF_BRIDGE;
while (argc > 0) {
if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) {
@ -327,20 +188,6 @@ static int fdb_show(int argc, char **argv)
} else if (strcmp(*argv, "br") == 0) {
NEXT_ARG();
br = *argv;
} else if (strcmp(*argv, "vlan") == 0) {
NEXT_ARG();
if (filter_vlan)
duparg("vlan", *argv);
filter_vlan = atoi(*argv);
} else if (strcmp(*argv, "state") == 0) {
unsigned int state;
NEXT_ARG();
if (state_a2n(&state, *argv))
invarg("invalid state", *argv);
filter_state |= state;
} else if (strcmp(*argv, "dynamic") == 0) {
filter_dynamic = 1;
} else {
if (matches(*argv, "help") == 0)
usage();
@ -350,37 +197,34 @@ static int fdb_show(int argc, char **argv)
if (br) {
int br_ifindex = ll_name_to_index(br);
if (br_ifindex == 0) {
fprintf(stderr, "Cannot find bridge device \"%s\"\n", br);
return -1;
}
filter_master = br_ifindex;
addattr32(&req.n, sizeof(req), IFLA_MASTER, br_ifindex);
msg_size += RTA_LENGTH(4);
}
/*we'll keep around filter_dev for older kernels */
if (filter_dev) {
filter_index = ll_name_to_index(filter_dev);
if (!filter_index)
return nodev(filter_dev);
filter_index = if_nametoindex(filter_dev);
if (filter_index == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n",
filter_dev);
return -1;
}
req.ifm.ifi_index = filter_index;
}
if (rth.flags & RTNL_HANDLE_F_STRICT_CHK)
rc = rtnl_neighdump_req(&rth, PF_BRIDGE, fdb_dump_filter);
else
rc = rtnl_fdb_linkdump_req_filter_fn(&rth, fdb_linkdump_filter);
if (rc < 0) {
if (rtnl_dump_request(&rth, RTM_GETNEIGH, &req.ifm, msg_size) < 0) {
perror("Cannot send dump request");
exit(1);
}
new_json_obj(json);
if (rtnl_dump_filter(&rth, print_fdb, stdout) < 0) {
fprintf(stderr, "Dump terminated\n");
exit(1);
}
delete_json_obj();
fflush(stdout);
return 0;
}
@ -388,16 +232,10 @@ static int fdb_show(int argc, char **argv)
static int fdb_modify(int cmd, int flags, int argc, char **argv)
{
struct {
struct nlmsghdr n;
struct ndmsg ndm;
char buf[256];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
.n.nlmsg_flags = NLM_F_REQUEST | flags,
.n.nlmsg_type = cmd,
.ndm.ndm_family = PF_BRIDGE,
.ndm.ndm_state = NUD_NOARP,
};
struct nlmsghdr n;
struct ndmsg ndm;
char buf[256];
} req;
char *addr = NULL;
char *d = NULL;
char abuf[ETH_ALEN];
@ -405,11 +243,17 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
inet_prefix dst;
unsigned long port = 0;
unsigned long vni = ~0;
unsigned long src_vni = ~0;
unsigned int via = 0;
char *endptr;
short vid = -1;
__u32 nhid = 0;
memset(&req, 0, sizeof(req));
req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
req.n.nlmsg_flags = NLM_F_REQUEST|flags;
req.n.nlmsg_type = cmd;
req.ndm.ndm_family = PF_BRIDGE;
req.ndm.ndm_state = NUD_NOARP;
while (argc > 0) {
if (strcmp(*argv, "dev") == 0) {
@ -421,10 +265,6 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
duparg2("dst", *argv);
get_addr(&dst, *argv, preferred_family);
dst_ok = 1;
} else if (strcmp(*argv, "nhid") == 0) {
NEXT_ARG();
if (get_u32(&nhid, *argv, 0))
invarg("\"id\" value is invalid\n", *argv);
} else if (strcmp(*argv, "port") == 0) {
NEXT_ARG();
@ -444,47 +284,31 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
if ((endptr && *endptr) ||
(vni >> 24) || vni == ULONG_MAX)
invarg("invalid VNI\n", *argv);
} else if (strcmp(*argv, "src_vni") == 0) {
NEXT_ARG();
src_vni = strtoul(*argv, &endptr, 0);
if ((endptr && *endptr) ||
(src_vni >> 24) || src_vni == ULONG_MAX)
invarg("invalid src VNI\n", *argv);
} else if (strcmp(*argv, "via") == 0) {
NEXT_ARG();
via = ll_name_to_index(*argv);
if (!via)
exit(nodev(*argv));
via = if_nametoindex(*argv);
if (via == 0)
invarg("invalid device\n", *argv);
} else if (strcmp(*argv, "self") == 0) {
req.ndm.ndm_flags |= NTF_SELF;
} else if (matches(*argv, "master") == 0) {
req.ndm.ndm_flags |= NTF_MASTER;
} else if (matches(*argv, "router") == 0) {
req.ndm.ndm_flags |= NTF_ROUTER;
} else if (matches(*argv, "local") == 0 ||
} else if (matches(*argv, "local") == 0||
matches(*argv, "permanent") == 0) {
req.ndm.ndm_state |= NUD_PERMANENT;
} else if (matches(*argv, "temp") == 0 ||
matches(*argv, "static") == 0) {
} else if (matches(*argv, "temp") == 0) {
req.ndm.ndm_state |= NUD_REACHABLE;
} else if (matches(*argv, "dynamic") == 0) {
req.ndm.ndm_state |= NUD_REACHABLE;
req.ndm.ndm_state &= ~NUD_NOARP;
} else if (matches(*argv, "vlan") == 0) {
if (vid >= 0)
duparg2("vlan", *argv);
NEXT_ARG();
vid = atoi(*argv);
} else if (matches(*argv, "use") == 0) {
req.ndm.ndm_flags |= NTF_USE;
} else if (matches(*argv, "extern_learn") == 0) {
req.ndm.ndm_flags |= NTF_EXT_LEARNED;
} else if (matches(*argv, "sticky") == 0) {
req.ndm.ndm_flags |= NTF_STICKY;
} else {
if (strcmp(*argv, "to") == 0)
if (strcmp(*argv, "to") == 0) {
NEXT_ARG();
}
if (matches(*argv, "help") == 0)
usage();
if (addr)
@ -499,11 +323,6 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
return -1;
}
if (nhid && (dst_ok || port || vni != ~0)) {
fprintf(stderr, "dst, port, vni are mutually exclusive with nhid\n");
return -1;
}
/* Assume self */
if (!(req.ndm.ndm_flags&(NTF_SELF|NTF_MASTER)))
req.ndm.ndm_flags |= NTF_SELF;
@ -525,8 +344,6 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
if (vid >= 0)
addattr16(&req.n, sizeof(req), NDA_VLAN, vid);
if (nhid > 0)
addattr32(&req.n, sizeof(req), NDA_NH_ID, nhid);
if (port) {
unsigned short dport;
@ -536,132 +353,17 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv)
}
if (vni != ~0)
addattr32(&req.n, sizeof(req), NDA_VNI, vni);
if (src_vni != ~0)
addattr32(&req.n, sizeof(req), NDA_SRC_VNI, src_vni);
if (via)
addattr32(&req.n, sizeof(req), NDA_IFINDEX, via);
req.ndm.ndm_ifindex = ll_name_to_index(d);
if (!req.ndm.ndm_ifindex)
return nodev(d);
if (rtnl_talk(&rth, &req.n, NULL) < 0)
return -1;
return 0;
}
static int fdb_get(int argc, char **argv)
{
struct {
struct nlmsghdr n;
struct ndmsg ndm;
char buf[1024];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
.n.nlmsg_flags = NLM_F_REQUEST,
.n.nlmsg_type = RTM_GETNEIGH,
.ndm.ndm_family = AF_BRIDGE,
};
char *d = NULL, *br = NULL;
struct nlmsghdr *answer;
unsigned long vni = ~0;
char abuf[ETH_ALEN];
int br_ifindex = 0;
char *addr = NULL;
short vlan = -1;
char *endptr;
while (argc > 0) {
if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) {
NEXT_ARG();
d = *argv;
} else if (strcmp(*argv, "br") == 0) {
NEXT_ARG();
br = *argv;
} else if (strcmp(*argv, "dev") == 0) {
NEXT_ARG();
d = *argv;
} else if (strcmp(*argv, "vni") == 0) {
NEXT_ARG();
vni = strtoul(*argv, &endptr, 0);
if ((endptr && *endptr) ||
(vni >> 24) || vni == ULONG_MAX)
invarg("invalid VNI\n", *argv);
} else if (strcmp(*argv, "self") == 0) {
req.ndm.ndm_flags |= NTF_SELF;
} else if (matches(*argv, "master") == 0) {
req.ndm.ndm_flags |= NTF_MASTER;
} else if (matches(*argv, "vlan") == 0) {
if (vlan >= 0)
duparg2("vlan", *argv);
NEXT_ARG();
vlan = atoi(*argv);
} else if (matches(*argv, "dynamic") == 0) {
filter_dynamic = 1;
} else {
if (strcmp(*argv, "to") == 0)
NEXT_ARG();
if (matches(*argv, "help") == 0)
usage();
if (addr)
duparg2("to", *argv);
addr = *argv;
}
argc--; argv++;
}
if ((d == NULL && br == NULL) || addr == NULL) {
fprintf(stderr, "Device or master and address are required arguments.\n");
if (req.ndm.ndm_ifindex == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n", d);
return -1;
}
if (sscanf(addr, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
abuf, abuf+1, abuf+2,
abuf+3, abuf+4, abuf+5) != 6) {
fprintf(stderr, "Invalid mac address %s\n", addr);
if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
return -1;
}
addattr_l(&req.n, sizeof(req), NDA_LLADDR, abuf, ETH_ALEN);
if (vlan >= 0)
addattr16(&req.n, sizeof(req), NDA_VLAN, vlan);
if (vni != ~0)
addattr32(&req.n, sizeof(req), NDA_VNI, vni);
if (d) {
req.ndm.ndm_ifindex = ll_name_to_index(d);
if (!req.ndm.ndm_ifindex) {
fprintf(stderr, "Cannot find device \"%s\"\n", d);
return -1;
}
}
if (br) {
br_ifindex = ll_name_to_index(br);
if (!br_ifindex) {
fprintf(stderr, "Cannot find bridge device \"%s\"\n", br);
return -1;
}
addattr32(&req.n, sizeof(req), NDA_MASTER, br_ifindex);
}
if (rtnl_talk(&rth, &req.n, &answer) < 0)
return -2;
/*
* Initialize a json_writer and open an array object
* if -json was specified.
*/
new_json_obj(json);
if (print_fdb(answer, stdout) < 0) {
fprintf(stderr, "An error :-)\n");
return -1;
}
delete_json_obj();
return 0;
}
@ -679,8 +381,6 @@ int do_fdb(int argc, char **argv)
return fdb_modify(RTM_NEWNEIGH, NLM_F_CREATE|NLM_F_REPLACE, argc-1, argv+1);
if (matches(*argv, "delete") == 0)
return fdb_modify(RTM_DELNEIGH, 0, argc-1, argv+1);
if (matches(*argv, "get") == 0)
return fdb_get(argc-1, argv+1);
if (matches(*argv, "show") == 0 ||
matches(*argv, "lst") == 0 ||
matches(*argv, "list") == 0)

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <stdio.h>
#include <stdlib.h>
@ -12,14 +11,13 @@
#include <string.h>
#include <stdbool.h>
#include "json_print.h"
#include "libnetlink.h"
#include "utils.h"
#include "br_common.h"
static unsigned int filter_index;
static const char *stp_states[] = {
static const char *port_states[] = {
[BR_STATE_DISABLED] = "disabled",
[BR_STATE_LISTENING] = "listening",
[BR_STATE_LEARNING] = "learning",
@ -27,21 +25,17 @@ static const char *stp_states[] = {
[BR_STATE_BLOCKING] = "blocking",
};
static const char *hw_mode[] = {
"VEB", "VEPA"
};
extern char *if_indextoname (unsigned int __ifindex, char *__ifname);
static void print_link_flags(FILE *fp, unsigned int flags, unsigned int mdown)
static void print_link_flags(FILE *fp, unsigned flags)
{
open_json_array(PRINT_ANY, is_json_context() ? "flags" : "<");
fprintf(fp, "<");
if (flags & IFF_UP && !(flags & IFF_RUNNING))
print_string(PRINT_ANY, NULL,
flags ? "%s," : "%s", "NO-CARRIER");
fprintf(fp, "NO-CARRIER%s", flags ? "," : "");
flags &= ~IFF_RUNNING;
#define _PF(f) if (flags&IFF_##f) { \
flags &= ~IFF_##f ; \
print_string(PRINT_ANY, NULL, flags ? "%s," : "%s", #f); }
#define _PF(f) if (flags&IFF_##f) { \
flags &= ~IFF_##f ; \
fprintf(fp, #f "%s", flags ? "," : ""); }
_PF(LOOPBACK);
_PF(BROADCAST);
_PF(POINTOPOINT);
@ -61,159 +55,61 @@ static void print_link_flags(FILE *fp, unsigned int flags, unsigned int mdown)
_PF(DORMANT);
_PF(ECHO);
#undef _PF
if (flags)
print_hex(PRINT_ANY, NULL, "%x", flags);
if (mdown)
print_string(PRINT_ANY, NULL, ",%s", "M-DOWN");
close_json_array(PRINT_ANY, "> ");
if (flags)
fprintf(fp, "%x", flags);
fprintf(fp, "> ");
}
void print_stp_state(__u8 state)
static const char *oper_states[] = {
"UNKNOWN", "NOTPRESENT", "DOWN", "LOWERLAYERDOWN",
"TESTING", "DORMANT", "UP"
};
static const char *hw_mode[] = {"VEB", "VEPA"};
/*
 * Print the operational state of a link to f.
 *
 * Values that index into oper_states[] are printed by name; anything
 * beyond the end of the table is printed as a hexadecimal number.
 */
static void print_operstate(FILE *f, __u8 state)
{
	const size_t known = sizeof(oper_states) / sizeof(oper_states[0]);

	if (state < known) {
		fprintf(f, "state %s ", oper_states[state]);
		return;
	}

	fprintf(f, "state %#x ", state);
}
static void print_portstate(FILE *f, __u8 state)
{
if (state <= BR_STATE_BLOCKING)
print_string(PRINT_ANY, "state",
"state %s ", stp_states[state]);
fprintf(f, "state %s ", port_states[state]);
else
print_uint(PRINT_ANY, "state",
"state (%d) ", state);
fprintf(f, "state (%d) ", state);
}
int parse_stp_state(const char *arg)
static void print_onoff(FILE *f, char *flag, __u8 val)
{
size_t nstates = ARRAY_SIZE(stp_states);
int state;
for (state = 0; state < nstates; state++)
if (strcmp(stp_states[state], arg) == 0)
break;
if (state == nstates)
state = -1;
return state;
fprintf(f, "%s %s ", flag, val ? "on" : "off");
}
static void print_hwmode(__u16 mode)
static void print_hwmode(FILE *f, __u16 mode)
{
if (mode >= ARRAY_SIZE(hw_mode))
print_0xhex(PRINT_ANY, "hwmode",
"hwmode %#llx ", mode);
if (mode >= sizeof(hw_mode)/sizeof(hw_mode[0]))
fprintf(f, "hwmode %#hx ", mode);
else
print_string(PRINT_ANY, "hwmode",
"hwmode %s ", hw_mode[mode]);
fprintf(f, "hwmode %s ", hw_mode[mode]);
}
/*
 * Print the bridge port information carried in an IFLA_PROTINFO attribute.
 *
 * A non-nested attribute is read as a single u8 STP state. A nested one is
 * parsed as an IFLA_BRPORT_* table: state, priority and cost are printed
 * whenever present, the remaining per-port settings only when show_details
 * is set.
 */
static void print_protinfo(FILE *fp, struct rtattr *attr)
{
	struct rtattr *port[IFLA_BRPORT_MAX + 1];

/* Print one u8 on/off port setting, if its attribute is present. */
#define BRPORT_ON_OFF(id, key)						\
	do {								\
		if (port[id])						\
			print_on_off(PRINT_ANY, key, key " %s ",	\
				     rta_getattr_u8(port[id]));		\
	} while (0)

	if (!(attr->rta_type & NLA_F_NESTED)) {
		print_stp_state(rta_getattr_u8(attr));
		return;
	}

	parse_rtattr_nested(port, IFLA_BRPORT_MAX, attr);

	if (port[IFLA_BRPORT_STATE])
		print_stp_state(rta_getattr_u8(port[IFLA_BRPORT_STATE]));

	if (port[IFLA_BRPORT_PRIORITY])
		print_uint(PRINT_ANY, "priority", "priority %u ",
			   rta_getattr_u16(port[IFLA_BRPORT_PRIORITY]));

	if (port[IFLA_BRPORT_COST])
		print_uint(PRINT_ANY, "cost", "cost %u ",
			   rta_getattr_u32(port[IFLA_BRPORT_COST]));

	if (!show_details)
		return;

	/* The line separator is only emitted for plain-text output. */
	if (!is_json_context())
		fprintf(fp, "%s ", _SL_);

	BRPORT_ON_OFF(IFLA_BRPORT_MODE, "hairpin");
	BRPORT_ON_OFF(IFLA_BRPORT_GUARD, "guard");
	BRPORT_ON_OFF(IFLA_BRPORT_PROTECT, "root_block");
	BRPORT_ON_OFF(IFLA_BRPORT_FAST_LEAVE, "fastleave");
	BRPORT_ON_OFF(IFLA_BRPORT_LEARNING, "learning");
	BRPORT_ON_OFF(IFLA_BRPORT_LEARNING_SYNC, "learning_sync");
	BRPORT_ON_OFF(IFLA_BRPORT_UNICAST_FLOOD, "flood");
	BRPORT_ON_OFF(IFLA_BRPORT_MCAST_FLOOD, "mcast_flood");
	BRPORT_ON_OFF(IFLA_BRPORT_MCAST_TO_UCAST, "mcast_to_unicast");
	BRPORT_ON_OFF(IFLA_BRPORT_NEIGH_SUPPRESS, "neigh_suppress");
	BRPORT_ON_OFF(IFLA_BRPORT_VLAN_TUNNEL, "vlan_tunnel");

	if (port[IFLA_BRPORT_BACKUP_PORT]) {
		int backup_idx = rta_getattr_u32(port[IFLA_BRPORT_BACKUP_PORT]);

		print_string(PRINT_ANY, "backup_port", "backup_port %s ",
			     ll_index_to_name(backup_idx));
	}

	BRPORT_ON_OFF(IFLA_BRPORT_ISOLATED, "isolated");

#undef BRPORT_ON_OFF
}
/*
 * Print the contents of an IFLA_AF_SPEC attribute. This is reported by HW
 * devices that have some bridging capabilities.
 *
 * The hardware mode is printed whenever IFLA_BRIDGE_MODE is present; the
 * VLAN information is printed only when show_details is set.
 */
static void print_af_spec(struct rtattr *attr, int ifindex)
{
	struct rtattr *spec[IFLA_BRIDGE_MAX + 1];

	parse_rtattr_nested(spec, IFLA_BRIDGE_MAX, attr);

	if (spec[IFLA_BRIDGE_MODE]) {
		__u16 mode = rta_getattr_u16(spec[IFLA_BRIDGE_MODE]);

		print_hwmode(mode);
	}

	if (show_details && spec[IFLA_BRIDGE_VLAN_INFO])
		print_vlan_info(spec[IFLA_BRIDGE_VLAN_INFO], ifindex);
}
int print_linkinfo(struct nlmsghdr *n, void *arg)
int print_linkinfo(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg)
{
FILE *fp = arg;
struct ifinfomsg *ifi = NLMSG_DATA(n);
struct rtattr *tb[IFLA_MAX+1];
unsigned int m_flag = 0;
int len = n->nlmsg_len;
const char *name;
struct ifinfomsg *ifi = NLMSG_DATA(n);
struct rtattr * tb[IFLA_MAX+1];
char b1[IFNAMSIZ];
len -= NLMSG_LENGTH(sizeof(*ifi));
if (len < 0) {
fprintf(stderr, "Message too short!\n");
return -1;
}
}
if (!(ifi->ifi_family == AF_BRIDGE || ifi->ifi_family == AF_UNSPEC))
return 0;
@ -223,87 +119,146 @@ int print_linkinfo(struct nlmsghdr *n, void *arg)
parse_rtattr_flags(tb, IFLA_MAX, IFLA_RTA(ifi), len, NLA_F_NESTED);
name = get_ifname_rta(ifi->ifi_index, tb[IFLA_IFNAME]);
if (!name)
if (tb[IFLA_IFNAME] == NULL) {
fprintf(stderr, "BUG: nil ifname\n");
return -1;
open_json_object(NULL);
if (n->nlmsg_type == RTM_DELLINK)
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
print_int(PRINT_ANY, "ifindex", "%d: ", ifi->ifi_index);
m_flag = print_name_and_link("%s: ", name, tb);
print_link_flags(fp, ifi->ifi_flags, m_flag);
if (tb[IFLA_MTU])
print_int(PRINT_ANY,
"mtu", "mtu %u ",
rta_getattr_u32(tb[IFLA_MTU]));
if (tb[IFLA_MASTER]) {
int master = rta_getattr_u32(tb[IFLA_MASTER]);
print_string(PRINT_ANY, "master", "master %s ",
ll_index_to_name(master));
}
if (tb[IFLA_PROTINFO])
print_protinfo(fp, tb[IFLA_PROTINFO]);
if (n->nlmsg_type == RTM_DELLINK)
fprintf(fp, "Deleted ");
if (tb[IFLA_AF_SPEC])
print_af_spec(tb[IFLA_AF_SPEC], ifi->ifi_index);
fprintf(fp, "%d: %s ", ifi->ifi_index,
tb[IFLA_IFNAME] ? rta_getattr_str(tb[IFLA_IFNAME]) : "<nil>");
print_string(PRINT_FP, NULL, "%s", "\n");
close_json_object();
if (tb[IFLA_OPERSTATE])
print_operstate(fp, rta_getattr_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINK]) {
SPRINT_BUF(b1);
int iflink = rta_getattr_u32(tb[IFLA_LINK]);
if (iflink == 0)
fprintf(fp, "@NONE: ");
else
fprintf(fp, "@%s: ",
if_indextoname(iflink, b1));
} else
fprintf(fp, ": ");
print_link_flags(fp, ifi->ifi_flags);
if (tb[IFLA_MTU])
fprintf(fp, "mtu %u ", rta_getattr_u32(tb[IFLA_MTU]));
if (tb[IFLA_MASTER])
fprintf(fp, "master %s ",
if_indextoname(rta_getattr_u32(tb[IFLA_MASTER]), b1));
if (tb[IFLA_PROTINFO]) {
if (tb[IFLA_PROTINFO]->rta_type & NLA_F_NESTED) {
struct rtattr *prtb[IFLA_BRPORT_MAX+1];
parse_rtattr_nested(prtb, IFLA_BRPORT_MAX,
tb[IFLA_PROTINFO]);
if (prtb[IFLA_BRPORT_STATE])
print_portstate(fp,
rta_getattr_u8(prtb[IFLA_BRPORT_STATE]));
if (prtb[IFLA_BRPORT_PRIORITY])
fprintf(fp, "priority %hu ",
rta_getattr_u16(prtb[IFLA_BRPORT_PRIORITY]));
if (prtb[IFLA_BRPORT_COST])
fprintf(fp, "cost %u ",
rta_getattr_u32(prtb[IFLA_BRPORT_COST]));
if (show_details) {
fprintf(fp, "%s ", _SL_);
if (prtb[IFLA_BRPORT_MODE])
print_onoff(fp, "hairpin",
rta_getattr_u8(prtb[IFLA_BRPORT_MODE]));
if (prtb[IFLA_BRPORT_GUARD])
print_onoff(fp, "guard",
rta_getattr_u8(prtb[IFLA_BRPORT_GUARD]));
if (prtb[IFLA_BRPORT_PROTECT])
print_onoff(fp, "root_block",
rta_getattr_u8(prtb[IFLA_BRPORT_PROTECT]));
if (prtb[IFLA_BRPORT_FAST_LEAVE])
print_onoff(fp, "fastleave",
rta_getattr_u8(prtb[IFLA_BRPORT_FAST_LEAVE]));
if (prtb[IFLA_BRPORT_LEARNING])
print_onoff(fp, "learning",
rta_getattr_u8(prtb[IFLA_BRPORT_LEARNING]));
if (prtb[IFLA_BRPORT_LEARNING_SYNC])
print_onoff(fp, "learning_sync",
rta_getattr_u8(prtb[IFLA_BRPORT_LEARNING_SYNC]));
if (prtb[IFLA_BRPORT_UNICAST_FLOOD])
print_onoff(fp, "flood",
rta_getattr_u8(prtb[IFLA_BRPORT_UNICAST_FLOOD]));
}
} else
print_portstate(fp, rta_getattr_u8(tb[IFLA_PROTINFO]));
}
if (tb[IFLA_AF_SPEC]) {
/* This is reported by HW devices that have some bridging
* capabilities.
*/
struct rtattr *aftb[IFLA_BRIDGE_MAX+1];
parse_rtattr_nested(aftb, IFLA_BRIDGE_MAX, tb[IFLA_AF_SPEC]);
if (aftb[IFLA_BRIDGE_MODE])
print_hwmode(fp, rta_getattr_u16(aftb[IFLA_BRIDGE_MODE]));
}
fprintf(fp, "\n");
fflush(fp);
return 0;
}
static void usage(void)
{
fprintf(stderr,
"Usage: bridge link set dev DEV [ cost COST ] [ priority PRIO ] [ state STATE ]\n"
" [ guard {on | off} ]\n"
" [ hairpin {on | off} ]\n"
" [ fastleave {on | off} ]\n"
" [ root_block {on | off} ]\n"
" [ learning {on | off} ]\n"
" [ learning_sync {on | off} ]\n"
" [ flood {on | off} ]\n"
" [ mcast_flood {on | off} ]\n"
" [ mcast_to_unicast {on | off} ]\n"
" [ neigh_suppress {on | off} ]\n"
" [ vlan_tunnel {on | off} ]\n"
" [ isolated {on | off} ]\n"
" [ hwmode {vepa | veb} ]\n"
" [ backup_port DEVICE ] [ nobackup_port ]\n"
" [ self ] [ master ]\n"
" bridge link show [dev DEV]\n");
fprintf(stderr, "Usage: bridge link set dev DEV [ cost COST ] [ priority PRIO ] [ state STATE ]\n");
fprintf(stderr, " [ guard {on | off} ]\n");
fprintf(stderr, " [ hairpin {on | off} ] \n");
fprintf(stderr, " [ fastleave {on | off} ]\n");
fprintf(stderr, " [ root_block {on | off} ]\n");
fprintf(stderr, " [ learning {on | off} ]\n");
fprintf(stderr, " [ learning_sync {on | off} ]\n");
fprintf(stderr, " [ flood {on | off} ]\n");
fprintf(stderr, " [ hwmode {vepa | veb} ]\n");
fprintf(stderr, " [ self ] [ master ]\n");
fprintf(stderr, " bridge link show [dev DEV]\n");
exit(-1);
}
/*
 * Parse an "on"/"off" command-line value.
 *
 * arg:  option name, used only in the error message.
 * attr: receives 1 for "on" and 0 for "off"; left untouched on error.
 * val:  the text to parse.
 *
 * Returns true on success. For any other text an error is printed to
 * stderr and false is returned.
 */
static bool on_off(char *arg, __s8 *attr, char *val)
{
	const bool enable = strcmp(val, "on") == 0;

	if (!enable && strcmp(val, "off") != 0) {
		fprintf(stderr,
			"Error: argument of \"%s\" must be \"on\" or \"off\"\n",
			arg);
		return false;
	}

	*attr = enable ? 1 : 0;
	return true;
}
static int brlink_modify(int argc, char **argv)
{
struct {
struct nlmsghdr n;
struct ifinfomsg ifm;
char buf[512];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
.n.nlmsg_flags = NLM_F_REQUEST,
.n.nlmsg_type = RTM_SETLINK,
.ifm.ifi_family = PF_BRIDGE,
};
} req;
char *d = NULL;
int backup_port_idx = -1;
__s8 neigh_suppress = -1;
__s8 learning = -1;
__s8 learning_sync = -1;
__s8 flood = -1;
__s8 vlan_tunnel = -1;
__s8 mcast_flood = -1;
__s8 mcast_to_unicast = -1;
__s8 isolated = -1;
__s8 hairpin = -1;
__s8 bpdu_guard = -1;
__s8 fast_leave = -1;
@ -314,7 +269,13 @@ static int brlink_modify(int argc, char **argv)
__s16 mode = -1;
__u16 flags = 0;
struct rtattr *nest;
int ret;
memset(&req, 0, sizeof(req));
req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
req.n.nlmsg_flags = NLM_F_REQUEST;
req.n.nlmsg_type = RTM_SETLINK;
req.ifm.ifi_family = PF_BRIDGE;
while (argc > 0) {
if (strcmp(*argv, "dev") == 0) {
@ -322,49 +283,32 @@ static int brlink_modify(int argc, char **argv)
d = *argv;
} else if (strcmp(*argv, "guard") == 0) {
NEXT_ARG();
bpdu_guard = parse_on_off("guard", *argv, &ret);
if (ret)
return ret;
if (!on_off("guard", &bpdu_guard, *argv))
return -1;
} else if (strcmp(*argv, "hairpin") == 0) {
NEXT_ARG();
hairpin = parse_on_off("hairpin", *argv, &ret);
if (ret)
return ret;
if (!on_off("hairping", &hairpin, *argv))
return -1;
} else if (strcmp(*argv, "fastleave") == 0) {
NEXT_ARG();
fast_leave = parse_on_off("fastleave", *argv, &ret);
if (ret)
return ret;
if (!on_off("fastleave", &fast_leave, *argv))
return -1;
} else if (strcmp(*argv, "root_block") == 0) {
NEXT_ARG();
root_block = parse_on_off("root_block", *argv, &ret);
if (ret)
return ret;
if (!on_off("root_block", &root_block, *argv))
return -1;
} else if (strcmp(*argv, "learning") == 0) {
NEXT_ARG();
learning = parse_on_off("learning", *argv, &ret);
if (ret)
return ret;
if (!on_off("learning", &learning, *argv))
return -1;
} else if (strcmp(*argv, "learning_sync") == 0) {
NEXT_ARG();
learning_sync = parse_on_off("learning_sync", *argv, &ret);
if (ret)
return ret;
if (!on_off("learning_sync", &learning_sync, *argv))
return -1;
} else if (strcmp(*argv, "flood") == 0) {
NEXT_ARG();
flood = parse_on_off("flood", *argv, &ret);
if (ret)
return ret;
} else if (strcmp(*argv, "mcast_flood") == 0) {
NEXT_ARG();
mcast_flood = parse_on_off("mcast_flood", *argv, &ret);
if (ret)
return ret;
} else if (strcmp(*argv, "mcast_to_unicast") == 0) {
NEXT_ARG();
mcast_to_unicast = parse_on_off("mcast_to_unicast", *argv, &ret);
if (ret)
return ret;
if (!on_off("flood", &flood, *argv))
return -1;
} else if (strcmp(*argv, "cost") == 0) {
NEXT_ARG();
cost = atoi(*argv);
@ -374,11 +318,13 @@ static int brlink_modify(int argc, char **argv)
} else if (strcmp(*argv, "state") == 0) {
NEXT_ARG();
char *endptr;
size_t nstates = sizeof(port_states) / sizeof(*port_states);
state = strtol(*argv, &endptr, 10);
if (!(**argv != '\0' && *endptr == '\0')) {
state = parse_stp_state(*argv);
if (state == -1) {
for (state = 0; state < nstates; state++)
if (strcmp(port_states[state], *argv) == 0)
break;
if (state == nstates) {
fprintf(stderr,
"Error: invalid STP port state\n");
return -1;
@ -393,38 +339,14 @@ static int brlink_modify(int argc, char **argv)
mode = BRIDGE_MODE_VEB;
else {
fprintf(stderr,
"Mode argument must be \"vepa\" or \"veb\".\n");
"Mode argument must be \"vepa\" or "
"\"veb\".\n");
return -1;
}
} else if (strcmp(*argv, "self") == 0) {
flags |= BRIDGE_FLAGS_SELF;
} else if (strcmp(*argv, "master") == 0) {
flags |= BRIDGE_FLAGS_MASTER;
} else if (strcmp(*argv, "neigh_suppress") == 0) {
NEXT_ARG();
neigh_suppress = parse_on_off("neigh_suppress", *argv, &ret);
if (ret)
return ret;
} else if (strcmp(*argv, "vlan_tunnel") == 0) {
NEXT_ARG();
vlan_tunnel = parse_on_off("vlan_tunnel", *argv, &ret);
if (ret)
return ret;
} else if (strcmp(*argv, "isolated") == 0) {
NEXT_ARG();
isolated = parse_on_off("isolated", *argv, &ret);
if (ret)
return ret;
} else if (strcmp(*argv, "backup_port") == 0) {
NEXT_ARG();
backup_port_idx = ll_name_to_index(*argv);
if (!backup_port_idx) {
fprintf(stderr, "Error: device %s does not exist\n",
*argv);
return -1;
}
} else if (strcmp(*argv, "nobackup_port") == 0) {
backup_port_idx = 0;
} else {
usage();
}
@ -459,12 +381,6 @@ static int brlink_modify(int argc, char **argv)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_PROTECT, root_block);
if (flood >= 0)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_UNICAST_FLOOD, flood);
if (mcast_flood >= 0)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_MCAST_FLOOD,
mcast_flood);
if (mcast_to_unicast >= 0)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_MCAST_TO_UCAST,
mcast_to_unicast);
if (learning >= 0)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_LEARNING, learning);
if (learning_sync >= 0)
@ -480,19 +396,6 @@ static int brlink_modify(int argc, char **argv)
if (state >= 0)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_STATE, state);
if (neigh_suppress != -1)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_NEIGH_SUPPRESS,
neigh_suppress);
if (vlan_tunnel != -1)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_VLAN_TUNNEL,
vlan_tunnel);
if (isolated != -1)
addattr8(&req.n, sizeof(req), IFLA_BRPORT_ISOLATED, isolated);
if (backup_port_idx != -1)
addattr32(&req.n, sizeof(req), IFLA_BRPORT_BACKUP_PORT,
backup_port_idx);
addattr_nest_end(&req.n, nest);
/* IFLA_AF_SPEC nested attribute. Contains IFLA_BRIDGE_FLAGS that
@ -512,7 +415,7 @@ static int brlink_modify(int argc, char **argv)
addattr_nest_end(&req.n, nest);
}
if (rtnl_talk(&rth, &req.n, NULL) < 0)
if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
return -1;
return 0;
@ -533,34 +436,22 @@ static int brlink_show(int argc, char **argv)
}
if (filter_dev) {
filter_index = ll_name_to_index(filter_dev);
if (!filter_index)
return nodev(filter_dev);
}
if (show_details) {
if (rtnl_linkdump_req_filter(&rth, PF_BRIDGE,
(compress_vlans ?
RTEXT_FILTER_BRVLAN_COMPRESSED :
RTEXT_FILTER_BRVLAN)) < 0) {
perror("Cannon send dump request");
exit(1);
}
} else {
if (rtnl_linkdump_req(&rth, PF_BRIDGE) < 0) {
perror("Cannon send dump request");
exit(1);
if ((filter_index = ll_name_to_index(filter_dev)) == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n",
filter_dev);
return -1;
}
}
new_json_obj(json);
if (rtnl_wilddump_request(&rth, PF_BRIDGE, RTM_GETLINK) < 0) {
perror("Cannon send dump request");
exit(1);
}
if (rtnl_dump_filter(&rth, print_linkinfo, stdout) < 0) {
fprintf(stderr, "Dump terminated\n");
exit(1);
}
delete_json_obj();
fflush(stdout);
return 0;
}

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Get mdb table with netlink
*/
@ -16,306 +15,77 @@
#include <arpa/inet.h>
#include "libnetlink.h"
#include "utils.h"
#include "br_common.h"
#include "rt_names.h"
#include "json_print.h"
#include "utils.h"
#ifndef MDBA_RTA
#define MDBA_RTA(r) \
((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct br_port_msg))))
((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct br_port_msg))))
#endif
static unsigned int filter_index, filter_vlan;
static unsigned int filter_index;
static void usage(void)
{
fprintf(stderr,
"Usage: bridge mdb { add | del } dev DEV port PORT grp GROUP [src SOURCE] [permanent | temp] [vid VID]\n"
" bridge mdb {show} [ dev DEV ] [ vid VID ]\n");
fprintf(stderr, "Usage: bridge mdb { add | del } dev DEV port PORT grp GROUP [permanent | temp]\n");
fprintf(stderr, " bridge mdb {show} [ dev DEV ]\n");
exit(-1);
}
static bool is_temp_mcast_rtr(__u8 type)
static void br_print_router_ports(FILE *f, struct rtattr *attr)
{
return type == MDB_RTR_TYPE_TEMP_QUERY || type == MDB_RTR_TYPE_TEMP;
}
/*
 * Format a tick count as "<seconds>.<hundredths>".
 *
 * ticks is converted with __jiffies_to_tv(). When align is non-zero the
 * seconds field is padded to a minimum width of four characters.
 *
 * Returns a pointer to a function-local static buffer, so the text is
 * overwritten by the next call.
 */
static const char *format_timer(__u32 ticks, int align)
{
	static char tbuf[32];
	struct timeval tv;
	unsigned long secs, hundredths;

	__jiffies_to_tv(&tv, ticks);
	secs = (unsigned long)tv.tv_sec;
	hundredths = (unsigned long)tv.tv_usec / 10000;

	snprintf(tbuf, sizeof(tbuf),
		 align ? "%4lu.%.2lu" : "%lu.%.2lu",
		 secs, hundredths);

	return tbuf;
}
/*
 * Print the optional details attached to one multicast router port entry.
 *
 * The attributes are parsed starting at MDB_RTR_RTA() of the entry payload,
 * with the payload length reduced by one aligned uint32_t. When present,
 * the timer is printed via format_timer() and the type is printed as
 * "temp" or "permanent".
 */
void br_print_router_port_stats(struct rtattr *pattr)
{
	struct rtattr *attrs[MDBA_ROUTER_PATTR_MAX + 1];

	parse_rtattr(attrs, MDBA_ROUTER_PATTR_MAX,
		     MDB_RTR_RTA(RTA_DATA(pattr)),
		     RTA_PAYLOAD(pattr) - RTA_ALIGN(sizeof(uint32_t)));

	if (attrs[MDBA_ROUTER_PATTR_TIMER])
		print_string(PRINT_ANY, "timer", " %s",
			     format_timer(rta_getattr_u32(attrs[MDBA_ROUTER_PATTR_TIMER]),
					  1));

	if (attrs[MDBA_ROUTER_PATTR_TYPE]) {
		const char *kind = "permanent";

		if (is_temp_mcast_rtr(rta_getattr_u8(attrs[MDBA_ROUTER_PATTR_TYPE])))
			kind = "temp";

		print_string(PRINT_ANY, "type", " %s", kind);
	}
}
/*
 * Print the multicast router ports of bridge brifname.
 *
 * attr is walked as a list of nested entries; the start of each entry's
 * payload is read as a uint32_t port ifindex. Three output shapes:
 *   - JSON:             an array named after the bridge, one object per
 *                       port (plus its details when show_stats is set);
 *   - text, no stats:   a single "router ports on <br>: p1 p2 ..." line;
 *   - text, with stats: one "router ports on <br>: <port> ..." line per
 *                       port.
 */
static void br_print_router_ports(FILE *f, struct rtattr *attr,
				  const char *brifname)
{
	int left = RTA_PAYLOAD(attr);
	struct rtattr *cur;

	if (is_json_context())
		open_json_array(PRINT_JSON, brifname);
	else if (!show_stats)
		fprintf(f, "router ports on %s: ", brifname);

	for (cur = RTA_DATA(attr); RTA_OK(cur, left); cur = RTA_NEXT(cur, left)) {
		uint32_t *ifindex = RTA_DATA(cur);
		const char *ifname = ll_index_to_name(*ifindex);

		if (is_json_context()) {
			open_json_object(NULL);
			print_string(PRINT_JSON, "port", NULL, ifname);
			if (show_stats)
				br_print_router_port_stats(cur);
			close_json_object();
			continue;
		}

		if (!show_stats) {
			fprintf(f, "%s ", ifname);
			continue;
		}

		/* With stats every port gets a full line of its own. */
		fprintf(f, "router ports on %s: %s", brifname, ifname);
		br_print_router_port_stats(cur);
		fprintf(f, "\n");
	}

	if (!show_stats)
		print_nl();

	close_json_array(PRINT_JSON, NULL);
}
static void print_src_entry(struct rtattr *src_attr, int af, const char *sep)
{
struct rtattr *stb[MDBA_MDB_SRCATTR_MAX + 1];
SPRINT_BUF(abuf);
const char *addr;
__u32 timer_val;
parse_rtattr_nested(stb, MDBA_MDB_SRCATTR_MAX, src_attr);
if (!stb[MDBA_MDB_SRCATTR_ADDRESS] || !stb[MDBA_MDB_SRCATTR_TIMER])
return;
addr = inet_ntop(af, RTA_DATA(stb[MDBA_MDB_SRCATTR_ADDRESS]), abuf,
sizeof(abuf));
if (!addr)
return;
timer_val = rta_getattr_u32(stb[MDBA_MDB_SRCATTR_TIMER]);
open_json_object(NULL);
print_string(PRINT_FP, NULL, "%s", sep);
print_color_string(PRINT_ANY, ifa_family_color(af),
"address", "%s", addr);
print_string(PRINT_ANY, "timer", "/%s", format_timer(timer_val, 0));
close_json_object();
}
/*
 * Print one multicast database entry.
 *
 * f:       unused here; output goes through the print_*() helpers.
 * ifindex: index of the device the entry was reported for ("dev").
 * e:       the entry itself (port, group address, state, flags, vid).
 * n:       the netlink message the entry came from (unused here).
 * tb:      optional per-entry attribute table (MDBA_MDB_EATTR_*); may be
 *          NULL, in which case source, details and timer are skipped.
 *
 * Entries whose vid does not match a non-zero filter_vlan are skipped.
 * The group address is formatted before anything is printed, so an entry
 * whose address cannot be formatted produces no output at all instead of
 * a half-printed line and a JSON object that is never closed.
 */
static void print_mdb_entry(FILE *f, int ifindex, const struct br_mdb_entry *e,
			    struct nlmsghdr *n, struct rtattr **tb)
{
	const void *grp, *src;
	const char *addr;
	SPRINT_BUF(abuf);
	const char *dev;
	int af;

	if (filter_vlan && e->vid != filter_vlan)
		return;

	/* A zero protocol marks a MAC group; otherwise IPv4 or IPv6. */
	if (!e->addr.proto) {
		af = AF_PACKET;
		grp = &e->addr.u.mac_addr;
	} else if (e->addr.proto == htons(ETH_P_IP)) {
		af = AF_INET;
		grp = &e->addr.u.ip4;
	} else {
		af = AF_INET6;
		grp = &e->addr.u.ip6;
	}
	dev = ll_index_to_name(ifindex);

	/*
	 * Bail out before open_json_object() so that the early return
	 * cannot skip the matching close_json_object()/print_nl().
	 * The ETH_ALEN argument is ignored for all cases but AF_PACKET.
	 */
	addr = rt_addr_n2a_r(af, ETH_ALEN, grp, abuf, sizeof(abuf));
	if (!addr)
		return;

	open_json_object(NULL);

	print_int(PRINT_JSON, "index", NULL, ifindex);
	print_color_string(PRINT_ANY, COLOR_IFNAME, "dev", "dev %s", dev);
	print_string(PRINT_ANY, "port", " port %s",
		     ll_index_to_name(e->ifindex));

	print_color_string(PRINT_ANY, ifa_family_color(af),
			   "grp", " grp %s", addr);

	/* abuf is reused from here on; addr has already been printed. */
	if (tb && tb[MDBA_MDB_EATTR_SOURCE]) {
		src = (const void *)RTA_DATA(tb[MDBA_MDB_EATTR_SOURCE]);
		print_color_string(PRINT_ANY, ifa_family_color(af),
				   "src", " src %s",
				   inet_ntop(af, src, abuf, sizeof(abuf)));
	}
	print_string(PRINT_ANY, "state", " %s",
		     (e->state & MDB_PERMANENT) ? "permanent" : "temp");

	if (show_details && tb) {
		if (tb[MDBA_MDB_EATTR_GROUP_MODE]) {
			__u8 mode = rta_getattr_u8(tb[MDBA_MDB_EATTR_GROUP_MODE]);

			print_string(PRINT_ANY, "filter_mode", " filter_mode %s",
				     mode == MCAST_INCLUDE ? "include" :
							     "exclude");
		}
		if (tb[MDBA_MDB_EATTR_SRC_LIST]) {
			struct rtattr *i, *attr = tb[MDBA_MDB_EATTR_SRC_LIST];
			const char *sep = " ";
			int rem;

			open_json_array(PRINT_ANY, is_json_context() ?
							"source_list" :
							" source_list");
			rem = RTA_PAYLOAD(attr);
			for (i = RTA_DATA(attr); RTA_OK(i, rem);
			     i = RTA_NEXT(i, rem)) {
				print_src_entry(i, af, sep);
				/* First entry after a space, the rest after commas. */
				sep = ",";
			}
			close_json_array(PRINT_JSON, NULL);
		}
		if (tb[MDBA_MDB_EATTR_RTPROT]) {
			__u8 rtprot = rta_getattr_u8(tb[MDBA_MDB_EATTR_RTPROT]);
			SPRINT_BUF(rtb);

			print_string(PRINT_ANY, "protocol", " proto %s ",
				     rtnl_rtprot_n2a(rtprot, rtb, sizeof(rtb)));
		}
	}

	open_json_array(PRINT_JSON, "flags");
	if (e->flags & MDB_FLAGS_OFFLOAD)
		print_string(PRINT_ANY, NULL, " %s", "offload");
	if (e->flags & MDB_FLAGS_FAST_LEAVE)
		print_string(PRINT_ANY, NULL, " %s", "fast_leave");
	if (e->flags & MDB_FLAGS_STAR_EXCL)
		print_string(PRINT_ANY, NULL, " %s", "added_by_star_ex");
	if (e->flags & MDB_FLAGS_BLOCKED)
		print_string(PRINT_ANY, NULL, " %s", "blocked");
	close_json_array(PRINT_JSON, NULL);

	if (e->vid)
		print_uint(PRINT_ANY, "vid", " vid %u", e->vid);

	if (show_stats && tb && tb[MDBA_MDB_EATTR_TIMER]) {
		__u32 timer = rta_getattr_u32(tb[MDBA_MDB_EATTR_TIMER]);

		print_string(PRINT_ANY, "timer", " %s",
			     format_timer(timer, 1));
	}

	print_nl();
	close_json_object();
}
static void br_print_mdb_entry(FILE *f, int ifindex, struct rtattr *attr,
struct nlmsghdr *n)
{
struct rtattr *etb[MDBA_MDB_EATTR_MAX + 1];
struct br_mdb_entry *e;
uint32_t *port_ifindex;
struct rtattr *i;
int rem;
rem = RTA_PAYLOAD(attr);
for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
e = RTA_DATA(i);
parse_rtattr_flags(etb, MDBA_MDB_EATTR_MAX, MDB_RTA(RTA_DATA(i)),
RTA_PAYLOAD(i) - RTA_ALIGN(sizeof(*e)),
NLA_F_NESTED);
print_mdb_entry(f, ifindex, e, n, etb);
port_ifindex = RTA_DATA(i);
fprintf(f, "%s ", ll_index_to_name(*port_ifindex));
}
fprintf(f, "\n");
}
static void print_mdb_entries(FILE *fp, struct nlmsghdr *n,
int ifindex, struct rtattr *mdb)
static void print_mdb_entry(FILE *f, int ifindex, struct br_mdb_entry *e)
{
SPRINT_BUF(abuf);
if (e->addr.proto == htons(ETH_P_IP))
fprintf(f, "dev %s port %s grp %s %s\n", ll_index_to_name(ifindex),
ll_index_to_name(e->ifindex),
inet_ntop(AF_INET, &e->addr.u.ip4, abuf, sizeof(abuf)),
(e->state & MDB_PERMANENT) ? "permanent" : "temp");
else
fprintf(f, "dev %s port %s grp %s %s\n", ll_index_to_name(ifindex),
ll_index_to_name(e->ifindex),
inet_ntop(AF_INET6, &e->addr.u.ip6, abuf, sizeof(abuf)),
(e->state & MDB_PERMANENT) ? "permanent" : "temp");
}
static void br_print_mdb_entry(FILE *f, int ifindex, struct rtattr *attr)
{
int rem = RTA_PAYLOAD(mdb);
struct rtattr *i;
int rem;
struct br_mdb_entry *e;
for (i = RTA_DATA(mdb); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
br_print_mdb_entry(fp, ifindex, i, n);
}
static void print_router_entries(FILE *fp, struct nlmsghdr *n,
int ifindex, struct rtattr *router)
{
const char *brifname = ll_index_to_name(ifindex);
if (n->nlmsg_type == RTM_GETMDB) {
if (show_details)
br_print_router_ports(fp, router, brifname);
} else {
struct rtattr *i = RTA_DATA(router);
uint32_t *port_ifindex = RTA_DATA(i);
const char *port_name = ll_index_to_name(*port_ifindex);
if (is_json_context()) {
open_json_array(PRINT_JSON, brifname);
open_json_object(NULL);
print_string(PRINT_JSON, "port", NULL,
port_name);
close_json_object();
close_json_array(PRINT_JSON, NULL);
} else {
fprintf(fp, "router port dev %s master %s\n",
port_name, brifname);
}
rem = RTA_PAYLOAD(attr);
for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
e = RTA_DATA(i);
print_mdb_entry(f, ifindex, e);
}
}
static int __parse_mdb_nlmsg(struct nlmsghdr *n, struct rtattr **tb)
int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
{
FILE *fp = arg;
struct br_port_msg *r = NLMSG_DATA(n);
int len = n->nlmsg_len;
struct rtattr * tb[MDBA_MAX+1];
if (n->nlmsg_type != RTM_GETMDB &&
n->nlmsg_type != RTM_NEWMDB &&
n->nlmsg_type != RTM_DELMDB) {
fprintf(stderr,
"Not RTM_GETMDB, RTM_NEWMDB or RTM_DELMDB: %08x %08x %08x\n",
if (n->nlmsg_type != RTM_GETMDB && n->nlmsg_type != RTM_NEWMDB && n->nlmsg_type != RTM_DELMDB) {
fprintf(stderr, "Not RTM_GETMDB, RTM_NEWMDB or RTM_DELMDB: %08x %08x %08x\n",
n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
return 0;
@ -332,62 +102,20 @@ static int __parse_mdb_nlmsg(struct nlmsghdr *n, struct rtattr **tb)
parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
return 1;
}
if (tb[MDBA_MDB]) {
struct rtattr *i;
int rem = RTA_PAYLOAD(tb[MDBA_MDB]);
static int print_mdbs(struct nlmsghdr *n, void *arg)
{
struct br_port_msg *r = NLMSG_DATA(n);
struct rtattr *tb[MDBA_MAX+1];
FILE *fp = arg;
int ret;
for (i = RTA_DATA(tb[MDBA_MDB]); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
br_print_mdb_entry(fp, r->ifindex, i);
}
ret = __parse_mdb_nlmsg(n, tb);
if (ret != 1)
return ret;
if (tb[MDBA_MDB])
print_mdb_entries(fp, n, r->ifindex, tb[MDBA_MDB]);
return 0;
}
/*
 * Dump callback: print only the router-port part (MDBA_ROUTER) of an MDB
 * netlink message. Returns whatever __parse_mdb_nlmsg() returned when that
 * is not 1, otherwise 0.
 */
static int print_rtrs(struct nlmsghdr *n, void *arg)
{
	struct rtattr *tb[MDBA_MAX+1];
	struct br_port_msg *bpm = NLMSG_DATA(n);
	FILE *out = arg;
	int rc = __parse_mdb_nlmsg(n, tb);

	if (rc != 1)
		return rc;

	if (tb[MDBA_ROUTER])
		print_router_entries(out, n, bpm->ifindex, tb[MDBA_ROUTER]);

	return 0;
}
int print_mdb_mon(struct nlmsghdr *n, void *arg)
{
struct br_port_msg *r = NLMSG_DATA(n);
struct rtattr *tb[MDBA_MAX+1];
FILE *fp = arg;
int ret;
ret = __parse_mdb_nlmsg(n, tb);
if (ret != 1)
return ret;
if (n->nlmsg_type == RTM_DELMDB)
print_bool(PRINT_ANY, "deleted", "Deleted ", true);
if (tb[MDBA_MDB])
print_mdb_entries(fp, n, r->ifindex, tb[MDBA_MDB]);
if (tb[MDBA_ROUTER])
print_router_entries(fp, n, r->ifindex, tb[MDBA_ROUTER]);
if (tb[MDBA_ROUTER]) {
if (show_details) {
fprintf(fp, "router ports on %s: ", ll_index_to_name(r->ifindex));
br_print_router_ports(fp, tb[MDBA_ROUTER]);
}
}
return 0;
}
@ -402,91 +130,49 @@ static int mdb_show(int argc, char **argv)
if (filter_dev)
duparg("dev", *argv);
filter_dev = *argv;
} else if (strcmp(*argv, "vid") == 0) {
NEXT_ARG();
if (filter_vlan)
duparg("vid", *argv);
filter_vlan = atoi(*argv);
}
argc--; argv++;
}
if (filter_dev) {
filter_index = ll_name_to_index(filter_dev);
if (!filter_index)
return nodev(filter_dev);
filter_index = if_nametoindex(filter_dev);
if (filter_index == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n",
filter_dev);
return -1;
}
}
new_json_obj(json);
open_json_object(NULL);
/* get mdb entries */
if (rtnl_mdbdump_req(&rth, PF_BRIDGE) < 0) {
if (rtnl_wilddump_request(&rth, PF_BRIDGE, RTM_GETMDB) < 0) {
perror("Cannot send dump request");
return -1;
}
open_json_array(PRINT_JSON, "mdb");
if (rtnl_dump_filter(&rth, print_mdbs, stdout) < 0) {
if (rtnl_dump_filter(&rth, print_mdb, stdout) < 0) {
fprintf(stderr, "Dump terminated\n");
return -1;
}
close_json_array(PRINT_JSON, NULL);
/* get router ports */
if (rtnl_mdbdump_req(&rth, PF_BRIDGE) < 0) {
perror("Cannot send dump request");
return -1;
}
open_json_object("router");
if (rtnl_dump_filter(&rth, print_rtrs, stdout) < 0) {
fprintf(stderr, "Dump terminated\n");
return -1;
}
close_json_object();
close_json_object();
delete_json_obj();
fflush(stdout);
return 0;
}
/*
 * Parse a group address string into @e.
 *
 * The string is tried as IPv4, then IPv6, then as a link-layer address of
 * ETH_ALEN bytes; addr.proto is set to ETH_P_IP, ETH_P_IPV6 (network byte
 * order) or 0 respectively.
 *
 * Returns 0 on success, -1 if none of the formats matched.
 */
static int mdb_parse_grp(const char *grp, struct br_mdb_entry *e)
{
	int maclen;

	if (inet_pton(AF_INET, grp, &e->addr.u.ip4)) {
		e->addr.proto = htons(ETH_P_IP);
	} else if (inet_pton(AF_INET6, grp, &e->addr.u.ip6)) {
		e->addr.proto = htons(ETH_P_IPV6);
	} else {
		maclen = ll_addr_a2n((char *)e->addr.u.mac_addr,
				     sizeof(e->addr.u.mac_addr), grp);
		if (maclen != ETH_ALEN)
			return -1;
		e->addr.proto = 0;
	}

	return 0;
}
static int mdb_modify(int cmd, int flags, int argc, char **argv)
{
struct {
struct nlmsghdr n;
struct nlmsghdr n;
struct br_port_msg bpm;
char buf[1024];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg)),
.n.nlmsg_flags = NLM_F_REQUEST | flags,
.n.nlmsg_type = cmd,
.bpm.family = PF_BRIDGE,
};
char *d = NULL, *p = NULL, *grp = NULL, *src = NULL;
struct br_mdb_entry entry = {};
short vid = 0;
char buf[1024];
} req;
struct br_mdb_entry entry;
char *d = NULL, *p = NULL, *grp = NULL;
memset(&req, 0, sizeof(req));
memset(&entry, 0, sizeof(entry));
req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg));
req.n.nlmsg_flags = NLM_F_REQUEST|flags;
req.n.nlmsg_type = cmd;
req.bpm.family = PF_BRIDGE;
while (argc > 0) {
if (strcmp(*argv, "dev") == 0) {
@ -503,12 +189,6 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
entry.state |= MDB_PERMANENT;
} else if (strcmp(*argv, "temp") == 0) {
;/* nothing */
} else if (strcmp(*argv, "vid") == 0) {
NEXT_ARG();
vid = atoi(*argv);
} else if (strcmp(*argv, "src") == 0) {
NEXT_ARG();
src = *argv;
} else {
if (matches(*argv, "help") == 0)
usage();
@ -522,40 +202,29 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
}
req.bpm.ifindex = ll_name_to_index(d);
if (!req.bpm.ifindex)
return nodev(d);
entry.ifindex = ll_name_to_index(p);
if (!entry.ifindex)
return nodev(p);
if (mdb_parse_grp(grp, &entry)) {
fprintf(stderr, "Invalid address \"%s\"\n", grp);
if (req.bpm.ifindex == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n", d);
return -1;
}
entry.vid = vid;
addattr_l(&req.n, sizeof(req), MDBA_SET_ENTRY, &entry, sizeof(entry));
if (src) {
struct rtattr *nest = addattr_nest(&req.n, sizeof(req),
MDBA_SET_ENTRY_ATTRS);
struct in6_addr src_ip6;
__be32 src_ip4;
nest->rta_type |= NLA_F_NESTED;
if (!inet_pton(AF_INET, src, &src_ip4)) {
if (!inet_pton(AF_INET6, src, &src_ip6)) {
fprintf(stderr, "Invalid source address \"%s\"\n", src);
return -1;
}
addattr_l(&req.n, sizeof(req), MDBE_ATTR_SOURCE, &src_ip6, sizeof(src_ip6));
} else {
addattr32(&req.n, sizeof(req), MDBE_ATTR_SOURCE, src_ip4);
}
addattr_nest_end(&req.n, nest);
entry.ifindex = ll_name_to_index(p);
if (entry.ifindex == 0) {
fprintf(stderr, "Cannot find device \"%s\"\n", p);
return -1;
}
if (rtnl_talk(&rth, &req.n, NULL) < 0)
if (!inet_pton(AF_INET, grp, &entry.addr.u.ip4)) {
if (!inet_pton(AF_INET6, grp, &entry.addr.u.ip6)) {
fprintf(stderr, "Invalid address \"%s\"\n", grp);
return -1;
} else
entry.addr.proto = htons(ETH_P_IPV6);
} else
entry.addr.proto = htons(ETH_P_IP);
addattr_l(&req.n, sizeof(req), MDBA_SET_ENTRY, &entry, sizeof(entry));
if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
return -1;
return 0;

View File

@ -27,15 +27,15 @@
static void usage(void) __attribute__((noreturn));
static int prefix_banner;
int prefix_banner;
static void usage(void)
{
fprintf(stderr, "Usage: bridge monitor [file | link | fdb | mdb | vlan | all]\n");
fprintf(stderr, "Usage: bridge monitor [file | link | fdb | mdb | all]\n");
exit(-1);
}
static int accept_msg(struct rtnl_ctrl_data *ctrl,
static int accept_msg(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg)
{
FILE *fp = arg;
@ -49,30 +49,24 @@ static int accept_msg(struct rtnl_ctrl_data *ctrl,
if (prefix_banner)
fprintf(fp, "[LINK]");
return print_linkinfo(n, arg);
return print_linkinfo(who, n, arg);
case RTM_NEWNEIGH:
case RTM_DELNEIGH:
if (prefix_banner)
fprintf(fp, "[NEIGH]");
return print_fdb(n, arg);
return print_fdb(who, n, arg);
case RTM_NEWMDB:
case RTM_DELMDB:
if (prefix_banner)
fprintf(fp, "[MDB]");
return print_mdb_mon(n, arg);
return print_mdb(who, n, arg);
case NLMSG_TSTAMP:
print_nlmsg_timestamp(fp, n);
return 0;
case RTM_NEWVLAN:
case RTM_DELVLAN:
if (prefix_banner)
fprintf(fp, "[VLAN]");
return print_vlan_rtm(n, arg, true, false);
default:
return 0;
}
@ -81,11 +75,10 @@ static int accept_msg(struct rtnl_ctrl_data *ctrl,
int do_monitor(int argc, char **argv)
{
char *file = NULL;
unsigned int groups = ~RTMGRP_TC;
int llink = 0;
int lneigh = 0;
int lmdb = 0;
int lvlan = 0;
unsigned groups = ~RTMGRP_TC;
int llink=0;
int lneigh=0;
int lmdb=0;
rtnl_close(&rth);
@ -94,7 +87,7 @@ int do_monitor(int argc, char **argv)
NEXT_ARG();
file = *argv;
} else if (matches(*argv, "link") == 0) {
llink = 1;
llink=1;
groups = 0;
} else if (matches(*argv, "fdb") == 0) {
lneigh = 1;
@ -102,13 +95,9 @@ int do_monitor(int argc, char **argv)
} else if (matches(*argv, "mdb") == 0) {
lmdb = 1;
groups = 0;
} else if (matches(*argv, "vlan") == 0) {
lvlan = 1;
groups = 0;
} else if (strcmp(*argv, "all") == 0) {
groups = ~RTMGRP_TC;
lvlan = 1;
prefix_banner = 1;
prefix_banner=1;
} else if (matches(*argv, "help") == 0) {
usage();
} else {
@ -132,7 +121,6 @@ int do_monitor(int argc, char **argv)
if (file) {
FILE *fp;
int err;
fp = fopen(file, "r");
if (fp == NULL) {
perror("Cannot fopen");
@ -145,12 +133,6 @@ int do_monitor(int argc, char **argv)
if (rtnl_open(&rth, groups) < 0)
exit(1);
if (lvlan && rtnl_add_nl_group(&rth, RTNLGRP_BRVLAN) < 0) {
fprintf(stderr, "Failed to add bridge vlan group to list\n");
exit(1);
}
ll_init_map(&rth);
if (rtnl_listen(&rth, accept_msg, stdout) < 0)
@ -158,3 +140,4 @@ int do_monitor(int argc, char **argv)
return 0;
}

File diff suppressed because it is too large Load Diff

496
configure vendored
View File

@ -1,28 +1,38 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#! /bin/bash
# This is not an autoconf generated configure
INCLUDE="$PWD/include"
PREFIX="/usr"
LIBDIR="\${prefix}/lib"
# Output file which is input to Makefile
CONFIG=config.mk
#
INCLUDE=${1:-"$PWD/include"}
# Make a temp directory in build tree.
TMPDIR=$(mktemp -d config.XXXXXX)
trap 'status=$?; rm -rf $TMPDIR; exit $status' EXIT HUP INT QUIT TERM
# check_prog PROG LABEL VAR
# Print LABEL (no newline), then look PROG up with `command -v`.
# Found: append "VAR:=y" to Config and print "yes".
# Not found: print "no"; that branch ends in `return 1` inside a subshell,
# which callers rely on as a failing status (see the `if`/`||` uses).
check_prog()
{
echo -n "$2"
command -v $1 >/dev/null 2>&1 && (echo "$3:=y" >> Config; echo "yes") || (echo "no"; return 1)
}
# Probe for the documentation tool chain via check_prog and print a warning
# for each tool that is missing. pdflatex and sgml2latex are only probed
# when latex itself was found; sgml2html is always probed.
check_docs()
{
if check_prog latex " latex: " HAVE_LATEX; then
check_prog pdflatex " pdflatex: " HAVE_PDFLATEX || echo " WARNING: no PDF docs can be built from LaTeX files"
check_prog sgml2latex " sgml2latex: " HAVE_SGML2LATEX || echo " WARNING: no LaTeX files can be build from SGML files"
else
echo " WARNING: no docs can be built from LaTeX files"
fi
check_prog sgml2html " sgml2html: " HAVE_SGML2HTML || echo " WARNING: no HTML docs can be built from SGML"
}
check_toolchain()
{
: ${PKG_CONFIG:=pkg-config}
: ${AR=ar}
: ${CC=gcc}
: ${YACC=bison}
echo "PKG_CONFIG:=${PKG_CONFIG}" >>$CONFIG
echo "AR:=${AR}" >>$CONFIG
echo "CC:=${CC}" >>$CONFIG
echo "YACC:=${YACC}" >>$CONFIG
echo "PKG_CONFIG:=${PKG_CONFIG}" >>Config
echo "AR:=${AR}" >>Config
echo "CC:=${CC}" >>Config
}
check_atm()
@ -36,8 +46,10 @@ int main(int argc, char **argv) {
}
EOF
if $CC -I$INCLUDE -o $TMPDIR/atmtest $TMPDIR/atmtest.c -latm >/dev/null 2>&1; then
echo "TC_CONFIG_ATM:=y" >>$CONFIG
$CC -I$INCLUDE -o $TMPDIR/atmtest $TMPDIR/atmtest.c -latm >/dev/null 2>&1
if [ $? -eq 0 ]
then
echo "TC_CONFIG_ATM:=y" >>Config
echo yes
else
echo no
@ -45,13 +57,6 @@ EOF
rm -f $TMPDIR/atmtest.c $TMPDIR/atmtest
}
# Record TC_CONFIG_NO_XT:=y in $CONFIG when pkg-config does not know an
# "xtables" package; later steps grep for this marker.
check_xtables()
{
if ! ${PKG_CONFIG} xtables --exists; then
echo "TC_CONFIG_NO_XT:=y" >>$CONFIG
fi
}
check_xt()
{
#check if we have xtables from iptables >= 1.4.5.
@ -75,8 +80,9 @@ int main(int argc, char **argv)
EOF
if $CC -I$INCLUDE $IPTC -o $TMPDIR/ipttest $TMPDIR/ipttest.c $IPTL \
$(${PKG_CONFIG} xtables --cflags --libs) -ldl >/dev/null 2>&1; then
echo "TC_CONFIG_XT:=y" >>$CONFIG
$(${PKG_CONFIG} xtables --cflags --libs) -ldl >/dev/null 2>&1
then
echo "TC_CONFIG_XT:=y" >>Config
echo "using xtables"
fi
rm -f $TMPDIR/ipttest.c $TMPDIR/ipttest
@ -84,10 +90,13 @@ EOF
check_xt_old()
{
# bail if previous XT checks has already succeeded.
grep -q TC_CONFIG_XT $CONFIG && return
# bail if previous XT checks has already succeded.
if grep -q TC_CONFIG_XT Config
then
return
fi
#check if we don't need our internal header ..
#check if we dont need our internal header ..
cat >$TMPDIR/ipttest.c <<EOF
#include <xtables.h>
char *lib_dir;
@ -109,8 +118,10 @@ int main(int argc, char **argv) {
EOF
if $CC -I$INCLUDE $IPTC -o $TMPDIR/ipttest $TMPDIR/ipttest.c $IPTL -ldl >/dev/null 2>&1; then
echo "TC_CONFIG_XT_OLD:=y" >>$CONFIG
$CC -I$INCLUDE $IPTC -o $TMPDIR/ipttest $TMPDIR/ipttest.c $IPTL -ldl >/dev/null 2>&1
if [ $? -eq 0 ]
then
echo "TC_CONFIG_XT_OLD:=y" >>Config
echo "using old xtables (no need for xt-internal.h)"
fi
rm -f $TMPDIR/ipttest.c $TMPDIR/ipttest
@ -118,8 +129,11 @@ EOF
check_xt_old_internal_h()
{
# bail if previous XT checks has already succeeded.
grep -q TC_CONFIG_XT $CONFIG && return
# bail if previous XT checks has already succeded.
if grep -q TC_CONFIG_XT Config
then
return
fi
#check if we need our own internal.h
cat >$TMPDIR/ipttest.c <<EOF
@ -143,25 +157,20 @@ int main(int argc, char **argv) {
}
EOF
if $CC -I$INCLUDE $IPTC -o $TMPDIR/ipttest $TMPDIR/ipttest.c $IPTL -ldl >/dev/null 2>&1; then
$CC -I$INCLUDE $IPTC -o $TMPDIR/ipttest $TMPDIR/ipttest.c $IPTL -ldl >/dev/null 2>&1
if [ $? -eq 0 ]
then
echo "using old xtables with xt-internal.h"
echo "TC_CONFIG_XT_OLD_H:=y" >>$CONFIG
echo "TC_CONFIG_XT_OLD_H:=y" >>Config
fi
rm -f $TMPDIR/ipttest.c $TMPDIR/ipttest
}
# Expand a literal "${prefix}" inside $LIBDIR to the value of $PREFIX,
# print the resulting directory and record it as LIBDIR in $CONFIG.
check_lib_dir()
{
LIBDIR=$(echo $LIBDIR | sed "s|\${prefix}|$PREFIX|")
echo -n "lib directory: "
echo "$LIBDIR"
echo "LIBDIR:=$LIBDIR" >> $CONFIG
}
check_ipt()
{
if ! grep TC_CONFIG_XT $CONFIG > /dev/null; then
if ! grep TC_CONFIG_XT Config > /dev/null
then
echo "using iptables"
fi
}
@ -171,16 +180,16 @@ check_ipt_lib_dir()
IPT_LIB_DIR=$(${PKG_CONFIG} --variable=xtlibdir xtables)
if [ -n "$IPT_LIB_DIR" ]; then
echo $IPT_LIB_DIR
echo "IPT_LIB_DIR:=$IPT_LIB_DIR" >> $CONFIG
echo "IPT_LIB_DIR:=$IPT_LIB_DIR" >> Config
return
fi
for dir in /lib /usr/lib /usr/local/lib; do
for file in "xtables" "iptables"; do
file="$dir/$file/lib*t_*so"
for dir in /lib /usr/lib /usr/local/lib
do
for file in $dir/{xtables,iptables}/lib*t_*so ; do
if [ -f $file ]; then
echo ${file%/*}
echo "IPT_LIB_DIR:=${file%/*}" >> $CONFIG
echo "IPT_LIB_DIR:=${file%/*}" >> Config
return
fi
done
@ -198,41 +207,17 @@ int main(int argc, char **argv)
return 0;
}
EOF
if $CC -I$INCLUDE -o $TMPDIR/setnstest $TMPDIR/setnstest.c >/dev/null 2>&1; then
echo "IP_CONFIG_SETNS:=y" >>$CONFIG
$CC -I$INCLUDE -o $TMPDIR/setnstest $TMPDIR/setnstest.c >/dev/null 2>&1
if [ $? -eq 0 ]
then
echo "IP_CONFIG_SETNS:=y" >>Config
echo "yes"
echo "CFLAGS += -DHAVE_SETNS" >>$CONFIG
else
echo "no"
fi
rm -f $TMPDIR/setnstest.c $TMPDIR/setnstest
}
# Compile and link a small program that calls name_to_handle_at().
# On success print "yes" and add -DHAVE_HANDLE_AT to CFLAGS in $CONFIG,
# otherwise print "no". The test program is only built, never run.
check_name_to_handle_at()
{
cat >$TMPDIR/name_to_handle_at_test.c <<EOF
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
int main(int argc, char **argv)
{
struct file_handle *fhp;
int mount_id, flags, dirfd;
char *pathname;
name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags);
return 0;
}
EOF
if $CC -I$INCLUDE -o $TMPDIR/name_to_handle_at_test $TMPDIR/name_to_handle_at_test.c >/dev/null 2>&1; then
echo "yes"
echo "CFLAGS += -DHAVE_HANDLE_AT" >>$CONFIG
else
echo "no"
fi
# remove the probe source and binary
rm -f $TMPDIR/name_to_handle_at_test.c $TMPDIR/name_to_handle_at_test
}
check_ipset()
{
cat >$TMPDIR/ipsettest.c <<EOF
@ -244,7 +229,7 @@ typedef unsigned short ip_set_id_t;
#include <linux/netfilter/xt_set.h>
struct xt_set_info info;
#if IPSET_PROTOCOL == 6 || IPSET_PROTOCOL == 7
#if IPSET_PROTOCOL == 6
int main(void)
{
return IPSET_MAXNAMELEN;
@ -254,8 +239,9 @@ int main(void)
#endif
EOF
if $CC -I$INCLUDE -o $TMPDIR/ipsettest $TMPDIR/ipsettest.c >/dev/null 2>&1; then
echo "TC_CONFIG_IPSET:=y" >>$CONFIG
if $CC -I$INCLUDE -o $TMPDIR/ipsettest $TMPDIR/ipsettest.c >/dev/null 2>&1
then
echo "TC_CONFIG_IPSET:=y" >>Config
echo "yes"
else
echo "no"
@ -265,131 +251,34 @@ EOF
check_elf()
{
if ${PKG_CONFIG} libelf --exists; then
echo "HAVE_ELF:=y" >>$CONFIG
echo "yes"
cat >$TMPDIR/elftest.c <<EOF
#include <libelf.h>
#include <gelf.h>
int main(void)
{
Elf_Scn *scn;
GElf_Shdr shdr;
return elf_version(EV_CURRENT);
}
EOF
echo 'CFLAGS += -DHAVE_ELF' `${PKG_CONFIG} libelf --cflags` >> $CONFIG
echo 'LDLIBS += ' `${PKG_CONFIG} libelf --libs` >>$CONFIG
if $CC -I$INCLUDE -o $TMPDIR/elftest $TMPDIR/elftest.c -lelf >/dev/null 2>&1
then
echo "TC_CONFIG_ELF:=y" >>Config
echo "yes"
else
echo "no"
fi
}
# Return the compiler's exit status for a probe program that references
# the libbpf functions listed below, built with $LIBBPF_CFLAGS and
# $LIBBPF_LDLIBS. Status 0 means all of them compiled and linked.
have_libbpf_basic()
{
cat >$TMPDIR/libbpf_test.c <<EOF
#include <bpf/libbpf.h>
int main(int argc, char **argv) {
bpf_program__set_autoload(NULL, false);
bpf_map__ifindex(NULL);
bpf_map__set_pin_path(NULL, NULL);
bpf_object__open_file(NULL, NULL);
return 0;
}
EOF
$CC -o $TMPDIR/libbpf_test $TMPDIR/libbpf_test.c $LIBBPF_CFLAGS $LIBBPF_LDLIBS >/dev/null 2>&1
# capture the compiler status before rm overwrites $?
local ret=$?
rm -f $TMPDIR/libbpf_test.c $TMPDIR/libbpf_test
return $ret
}
# Return the compiler's exit status for a probe program that calls
# bpf_program__section_name(), built with $LIBBPF_CFLAGS and $LIBBPF_LDLIBS.
have_libbpf_sec_name()
{
cat >$TMPDIR/libbpf_sec_test.c <<EOF
#include <bpf/libbpf.h>
int main(int argc, char **argv) {
void *ptr;
bpf_program__section_name(NULL);
return 0;
}
EOF
$CC -o $TMPDIR/libbpf_sec_test $TMPDIR/libbpf_sec_test.c $LIBBPF_CFLAGS $LIBBPF_LDLIBS >/dev/null 2>&1
# capture the compiler status before rm overwrites $?
local ret=$?
rm -f $TMPDIR/libbpf_sec_test.c $TMPDIR/libbpf_sec_test
return $ret
}
# Called on the paths where no usable libbpf was found: abort the whole
# configure run (exit 1) when the user demanded libbpf with LIBBPF_FORCE=on.
check_force_libbpf_on()
{
# if LIBBPF_FORCE=on is set but there is no libbpf support, just exit the
# config process to make sure we don't build without libbpf.
if [ "$LIBBPF_FORCE" = on ]; then
echo " LIBBPF_FORCE=on set, but couldn't find a usable libbpf"
exit 1
fi
}
check_libbpf()
{
# if set LIBBPF_FORCE=off, disable libbpf entirely
if [ "$LIBBPF_FORCE" = off ]; then
echo "no"
return
fi
if ! ${PKG_CONFIG} libbpf --exists && [ -z "$LIBBPF_DIR" ] ; then
echo "no"
check_force_libbpf_on
return
fi
if [ $(uname -m) = x86_64 ]; then
local LIBBPF_LIBDIR="${LIBBPF_DIR}/usr/lib64"
else
local LIBBPF_LIBDIR="${LIBBPF_DIR}/usr/lib"
fi
if [ -n "$LIBBPF_DIR" ]; then
LIBBPF_CFLAGS="-I${LIBBPF_DIR}/usr/include"
LIBBPF_LDLIBS="${LIBBPF_LIBDIR}/libbpf.a -lz -lelf"
LIBBPF_VERSION=$(PKG_CONFIG_LIBDIR=${LIBBPF_LIBDIR}/pkgconfig ${PKG_CONFIG} libbpf --modversion)
else
LIBBPF_CFLAGS=$(${PKG_CONFIG} libbpf --cflags)
LIBBPF_LDLIBS=$(${PKG_CONFIG} libbpf --libs)
LIBBPF_VERSION=$(${PKG_CONFIG} libbpf --modversion)
fi
if ! have_libbpf_basic; then
echo "no"
echo " libbpf version $LIBBPF_VERSION is too low, please update it to at least 0.1.0"
check_force_libbpf_on
return
else
echo "HAVE_LIBBPF:=y" >> $CONFIG
echo 'CFLAGS += -DHAVE_LIBBPF ' $LIBBPF_CFLAGS >> $CONFIG
echo "CFLAGS += -DLIBBPF_VERSION=\\\"$LIBBPF_VERSION\\\"" >> $CONFIG
echo 'LDLIBS += ' $LIBBPF_LDLIBS >> $CONFIG
if [ -z "$LIBBPF_DIR" ]; then
echo "CFLAGS += -DLIBBPF_DYNAMIC" >> $CONFIG
fi
fi
# bpf_program__title() is deprecated since libbpf 0.2.0, use
# bpf_program__section_name() instead if we support
if have_libbpf_sec_name; then
echo "HAVE_LIBBPF_SECTION_NAME:=y" >> $CONFIG
echo 'CFLAGS += -DHAVE_LIBBPF_SECTION_NAME ' >> $CONFIG
fi
echo "yes"
echo " libbpf version $LIBBPF_VERSION"
rm -f $TMPDIR/elftest.c $TMPDIR/elftest
}
check_selinux()
# SELinux is a compile time option in the ss utility
{
if ${PKG_CONFIG} libselinux --exists; then
echo "HAVE_SELINUX:=y" >>$CONFIG
if ${PKG_CONFIG} libselinux --exists
then
echo "HAVE_SELINUX:=y" >>Config
echo "yes"
echo 'LDLIBS +=' `${PKG_CONFIG} --libs libselinux` >>$CONFIG
echo 'CFLAGS += -DHAVE_SELINUX' `${PKG_CONFIG} --cflags libselinux` >>$CONFIG
else
echo "no"
fi
@ -397,174 +286,16 @@ check_selinux()
check_mnl()
{
if ${PKG_CONFIG} libmnl --exists; then
echo "HAVE_MNL:=y" >>$CONFIG
echo "yes"
echo 'CFLAGS += -DHAVE_LIBMNL' `${PKG_CONFIG} libmnl --cflags` >>$CONFIG
echo 'LDLIBS +=' `${PKG_CONFIG} libmnl --libs` >> $CONFIG
if ${PKG_CONFIG} libmnl --exists
then
echo "HAVE_MNL:=y" >>Config
echo -n "yes"
else
echo "no"
echo -n "no"
fi
}
# Compile and link (with -ldb) a probe that calls dbopen() from <db_185.h>.
# On success print "yes" and record HAVE_BERKELEY_DB:=y in $CONFIG,
# otherwise print "no". The probe is only built, never executed.
check_berkeley_db()
{
cat >$TMPDIR/dbtest.c <<EOF
#include <fcntl.h>
#include <stdlib.h>
#include <db_185.h>
int main(int argc, char **argv) {
dbopen("/tmp/xxx_test_db.db", O_CREAT|O_RDWR, 0644, DB_HASH, NULL);
return 0;
}
EOF
if $CC -I$INCLUDE -o $TMPDIR/dbtest $TMPDIR/dbtest.c -ldb >/dev/null 2>&1; then
echo "HAVE_BERKELEY_DB:=y" >>$CONFIG
echo "yes"
else
echo "no"
fi
rm -f $TMPDIR/dbtest.c $TMPDIR/dbtest
}
# Answer the "need for strlcpy" question printed by the caller.
#  - probe builds without extra libraries: print "no".
#  - probe fails but pkg-config knows libbsd: add -DHAVE_LIBBSD plus the
#    libbsd flags to $CONFIG and print "no".
#  - otherwise: add -DNEED_STRLCPY to CFLAGS and print "yes".
check_strlcpy()
{
cat >$TMPDIR/strtest.c <<EOF
#include <string.h>
int main(int argc, char **argv) {
char dst[10];
strlcpy(dst, "test", sizeof(dst));
return 0;
}
EOF
if $CC -I$INCLUDE -o $TMPDIR/strtest $TMPDIR/strtest.c >/dev/null 2>&1; then
echo "no"
else
if ${PKG_CONFIG} libbsd --exists; then
echo 'CFLAGS += -DHAVE_LIBBSD' `${PKG_CONFIG} libbsd --cflags` >>$CONFIG
echo 'LDLIBS +=' `${PKG_CONFIG} libbsd --libs` >> $CONFIG
echo "no"
else
echo 'CFLAGS += -DNEED_STRLCPY' >>$CONFIG
echo "yes"
fi
fi
rm -f $TMPDIR/strtest.c $TMPDIR/strtest
}
# If pkg-config knows libcap: record HAVE_CAP:=y, print "yes" and append
# -DHAVE_LIBCAP plus libcap's cflags/libs to $CONFIG. Otherwise print "no".
check_cap()
{
if ${PKG_CONFIG} libcap --exists; then
echo "HAVE_CAP:=y" >>$CONFIG
echo "yes"
echo 'CFLAGS += -DHAVE_LIBCAP' `${PKG_CONFIG} libcap --cflags` >>$CONFIG
echo 'LDLIBS +=' `${PKG_CONFIG} libcap --libs` >> $CONFIG
else
echo "no"
fi
}
# Write a make fragment to stdout that defines VERBOSE, Q and the QUIET_*
# command prefixes. `$` is escaped in the here-document so that the make
# variable references reach the output unexpanded by the shell.
quiet_config()
{
cat <<EOF
# user can control verbosity similar to kernel builds (e.g., V=1)
ifeq ("\$(origin V)", "command line")
VERBOSE = \$(V)
endif
ifndef VERBOSE
VERBOSE = 0
endif
ifeq (\$(VERBOSE),1)
Q =
else
Q = @
endif
ifeq (\$(VERBOSE), 0)
QUIET_CC = @echo ' CC '\$@;
QUIET_AR = @echo ' AR '\$@;
QUIET_LINK = @echo ' LINK '\$@;
QUIET_YACC = @echo ' YACC '\$@;
QUIET_LEX = @echo ' LEX '\$@;
endif
EOF
}
# usage STATUS
# Print the option summary to stdout and terminate the script with STATUS.
usage()
{
cat <<EOF
Usage: $0 [OPTIONS]
--include_dir <dir> Path to iproute2 include dir
--libdir <dir> Path to iproute2 lib dir
--libbpf_dir <dir> Path to libbpf DESTDIR
--libbpf_force <on|off> Enable/disable libbpf by force. Available options:
on: require link against libbpf, quit config if no libbpf support
off: disable libbpf probing
--prefix <dir> Path prefix of the lib files to install
-h | --help Show this usage info
EOF
exit $1
}
# Compat with the old INCLUDE path setting method.
if [ $# -eq 1 ] && [ "$(echo $1 | cut -c 1)" != '-' ]; then
INCLUDE="$1"
else
while [ "$#" -gt 0 ]; do
case "$1" in
--include_dir)
shift
INCLUDE="$1" ;;
--include_dir=*)
INCLUDE="${1#*=}" ;;
--libdir)
shift
LIBDIR="$1" ;;
--libdir=*)
LIBDIR="${1#*=}" ;;
--libbpf_dir)
shift
LIBBPF_DIR="$1" ;;
--libbpf_dir=*)
LIBBPF_DIR="${1#*=}" ;;
--libbpf_force)
shift
LIBBPF_FORCE="$1" ;;
--libbpf_force=*)
LIBBPF_FORCE="${1#*=}" ;;
--prefix)
shift
PREFIX="$1" ;;
--prefix=*)
PREFIX="${1#*=}" ;;
-h | --help)
usage 0 ;;
--*)
;;
*)
usage 1 ;;
esac
[ "$#" -gt 0 ] && shift
done
fi
[ -d "$INCLUDE" ] || usage 1
if [ "${LIBBPF_DIR-unused}" != "unused" ]; then
[ -d "$LIBBPF_DIR" ] || usage 1
fi
if [ "${LIBBPF_FORCE-unused}" != "unused" ]; then
if [ "$LIBBPF_FORCE" != 'on' ] && [ "$LIBBPF_FORCE" != 'off' ]; then
usage 1
fi
fi
[ -z "$PREFIX" ] && usage 1
[ -z "$LIBDIR" ] && usage 1
echo "# Generated config based on" $INCLUDE >$CONFIG
quiet_config >> $CONFIG
echo "# Generated config based on" $INCLUDE >Config
check_toolchain
echo "TC schedulers"
@ -572,52 +303,31 @@ echo "TC schedulers"
echo -n " ATM "
check_atm
check_xtables
if ! grep -q TC_CONFIG_NO_XT $CONFIG; then
echo -n " IPT "
check_xt
check_xt_old
check_xt_old_internal_h
check_ipt
echo -n " IPT "
check_xt
check_xt_old
check_xt_old_internal_h
check_ipt
echo -n " IPSET "
check_ipset
fi
echo -n " IPSET "
check_ipset
echo
check_lib_dir
if ! grep -q TC_CONFIG_NO_XT $CONFIG; then
echo -n "iptables modules directory: "
check_ipt_lib_dir
fi
echo -n -e "\niptables modules directory: "
check_ipt_lib_dir
echo -n "libc has setns: "
check_setns
echo -n "libc has name_to_handle_at: "
check_name_to_handle_at
echo -n "SELinux support: "
check_selinux
echo -n "libbpf support: "
check_libbpf
echo -n "ELF support: "
check_elf
echo -n "libmnl support: "
check_mnl
echo " (required by tipc)"
echo -n "Berkeley DB: "
check_berkeley_db
echo -n "need for strlcpy: "
check_strlcpy
echo -n "libcap support: "
check_cap
echo >> $CONFIG
echo "%.o: %.c" >> $CONFIG
echo ' $(QUIET_CC)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(CPPFLAGS) -c -o $@ $<' >> $CONFIG
echo -e "\nDocs"
check_docs
echo ""

1
dcb/.gitignore vendored
View File

@ -1 +0,0 @@
dcb

View File

@ -1,31 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
include ../config.mk
TARGETS :=
ifeq ($(HAVE_MNL),y)
DCBOBJ = dcb.o \
dcb_app.o \
dcb_buffer.o \
dcb_dcbx.o \
dcb_ets.o \
dcb_maxrate.o \
dcb_pfc.o
TARGETS += dcb
LDLIBS += -lm
endif
all: $(TARGETS) $(LIBS)
dcb: $(DCBOBJ) $(LIBNETLINK)
$(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@
install: all
for i in $(TARGETS); \
do install -m 0755 $$i $(DESTDIR)$(SBINDIR); \
done
clean:
rm -f $(DCBOBJ) $(TARGETS)

611
dcb/dcb.c
View File

@ -1,611 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <inttypes.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include <libmnl/libmnl.h>
#include <getopt.h>
#include "dcb.h"
#include "mnl_utils.h"
#include "namespace.h"
#include "utils.h"
#include "version.h"
/*
 * Set up the run-time state of @dcb: allocate the netlink message buffer,
 * open a NETLINK_ROUTE socket and start the JSON output context.
 *
 * Returns 0 on success. On failure a diagnostic is printed with perror(),
 * any buffer already allocated is released, and -1 is returned.
 */
static int dcb_init(struct dcb *dcb)
{
	dcb->buf = malloc(MNL_SOCKET_BUFFER_SIZE);
	if (!dcb->buf) {
		perror("Netlink buffer allocation");
		return -1;
	}

	dcb->nl = mnlu_socket_open(NETLINK_ROUTE);
	if (!dcb->nl) {
		perror("Open netlink socket");
		free(dcb->buf);
		return -1;
	}

	new_json_obj_plain(dcb->json_output);
	return 0;
}
/*
 * Undo dcb_init(): end the JSON output context, close the netlink socket
 * and free the message buffer.
 */
static void dcb_fini(struct dcb *dcb)
{
delete_json_obj_plain();
mnl_socket_close(dcb->nl);
free(dcb->buf);
}
/* Allocate a zero-filled struct dcb; returns NULL when out of memory. */
static struct dcb *dcb_alloc(void)
{
	return calloc(1, sizeof(struct dcb));
}
/* Release a struct dcb obtained from dcb_alloc(). */
static void dcb_free(struct dcb *dcb)
{
free(dcb);
}
/* Request/result state handed to the attribute-lookup parse callbacks. */
struct dcb_get_attribute {
struct dcb *dcb;
int attr; /* attribute type being searched for */
void *payload; /* payload of the matching attribute; NULL until found */
__u16 payload_len; /* length of that payload */
};
/*
 * Attribute callback: when @attr has the type requested in the
 * struct dcb_get_attribute passed as @data, record its payload pointer and
 * length there and return MNL_CB_STOP; otherwise return MNL_CB_OK.
 */
static int dcb_get_attribute_attr_ieee_cb(const struct nlattr *attr, void *data)
{
	struct dcb_get_attribute *lookup = data;

	if (mnl_attr_get_type(attr) == lookup->attr) {
		lookup->payload = mnl_attr_get_payload(attr);
		lookup->payload_len = mnl_attr_get_payload_len(attr);
		return MNL_CB_STOP;
	}

	return MNL_CB_OK;
}
/*
 * Top-level attribute callback: descend into a DCB_ATTR_IEEE nest and run
 * the lookup callback on its members; any other attribute is passed over
 * with MNL_CB_OK.
 */
static int dcb_get_attribute_attr_cb(const struct nlattr *attr, void *data)
{
	if (mnl_attr_get_type(attr) == DCB_ATTR_IEEE)
		return mnl_attr_parse_nested(attr, dcb_get_attribute_attr_ieee_cb, data);

	return MNL_CB_OK;
}
/*
 * Message callback for IEEE-wrapped attributes: parse the attributes that
 * follow the struct dcbmsg header with dcb_get_attribute_attr_cb().
 */
static int dcb_get_attribute_cb(const struct nlmsghdr *nlh, void *data)
{
return mnl_attr_parse(nlh, sizeof(struct dcbmsg), dcb_get_attribute_attr_cb, data);
}
/*
 * Message callback for attributes that sit directly after the struct dcbmsg
 * header; @data is the struct dcb_get_attribute describing the lookup.
 */
static int dcb_get_attribute_bare_cb(const struct nlmsghdr *nlh, void *data)
{
/* Bare attributes (e.g. DCB_ATTR_DCBX) are not wrapped inside an IEEE
* container, so this does not have to go through unpacking in
* dcb_get_attribute_attr_cb().
*/
return mnl_attr_parse(nlh, sizeof(struct dcbmsg),
dcb_get_attribute_attr_ieee_cb, data);
}
/* Tells dcb_set_attribute_attr_cb() which attribute carries the status. */
struct dcb_set_attribute_response {
int response_attr; /* attribute type to inspect in the reply */
};
static int dcb_set_attribute_attr_cb(const struct nlattr *attr, void *data)
{
struct dcb_set_attribute_response *resp = data;
uint16_t len;
uint8_t err;
if (mnl_attr_get_type(attr) != resp->response_attr)
return MNL_CB_OK;
len = mnl_attr_get_payload_len(attr);
if (len != 1) {
fprintf(stderr, "Response attribute expected to have size 1, not %d\n", len);
return MNL_CB_ERROR;
}
err = mnl_attr_get_u8(attr);
if (err) {
fprintf(stderr, "Error when attempting to set attribute: %s\n",
strerror(err));
return MNL_CB_ERROR;
}
return MNL_CB_STOP;
}
/*
 * Message callback for set replies: parse the attributes that follow the
 * struct dcbmsg header with dcb_set_attribute_attr_cb().
 */
static int dcb_set_attribute_cb(const struct nlmsghdr *nlh, void *data)
{
return mnl_attr_parse(nlh, sizeof(struct dcbmsg), dcb_set_attribute_attr_cb, data);
}
/*
 * Send the request @nlh on the dcb netlink socket and process the replies
 * with @cb, which receives @data.
 *
 * Returns -1 if sending fails (after a perror() diagnostic), otherwise the
 * result of mnlu_socket_recv_run().
 */
static int dcb_talk(struct dcb *dcb, struct nlmsghdr *nlh, mnl_cb_t cb, void *data)
{
	if (mnl_socket_sendto(dcb->nl, nlh, nlh->nlmsg_len) < 0) {
		perror("mnl_socket_sendto");
		return -1;
	}

	return mnlu_socket_recv_run(dcb->nl, nlh->nlmsg_seq, dcb->buf,
				    MNL_SOCKET_BUFFER_SIZE, cb, data);
}
static struct nlmsghdr *dcb_prepare(struct dcb *dcb, const char *dev,
uint32_t nlmsg_type, uint8_t dcb_cmd)
{
struct dcbmsg dcbm = {
.cmd = dcb_cmd,
};
struct nlmsghdr *nlh;
nlh = mnlu_msg_prepare(dcb->buf, nlmsg_type, NLM_F_REQUEST, &dcbm, sizeof(dcbm));
mnl_attr_put_strz(nlh, DCB_ATTR_IFNAME, dev);
return nlh;
}
/*
 * Issue an RTM_GETDCB request with DCB command @command for @dev and look
 * for attribute @attr in the reply using @get_attribute_cb.
 *
 * On success returns 0 and stores the attribute payload pointer and length
 * in *@payload_p / *@payload_len_p; the pointer is whatever
 * mnl_attr_get_payload() returned while the reply was being parsed.
 * Returns the dcb_talk() result if the exchange failed, or -ENOENT if the
 * reply did not contain the attribute.
 */
static int __dcb_get_attribute(struct dcb *dcb, int command,
			       const char *dev, int attr,
			       void **payload_p, __u16 *payload_len_p,
			       int (*get_attribute_cb)(const struct nlmsghdr *nlh,
						       void *data))
{
	struct dcb_get_attribute ga;
	struct nlmsghdr *nlh;
	int ret;

	nlh = dcb_prepare(dcb, dev, RTM_GETDCB, command);

	ga = (struct dcb_get_attribute) {
		.dcb = dcb,
		.attr = attr,
		.payload = NULL,
	};
	ret = dcb_talk(dcb, nlh, get_attribute_cb, &ga);
	if (ret) {
		perror("Attribute read");
		return ret;
	}
	if (ga.payload == NULL) {
		/*
		 * No library call failed here, so errno is meaningless;
		 * perror() would append an unrelated error string.
		 */
		fprintf(stderr, "Attribute not found\n");
		return -ENOENT;
	}

	*payload_p = ga.payload;
	*payload_len_p = ga.payload_len;
	return 0;
}
/*
 * Fetch attribute @attr for @dev with DCB_CMD_IEEE_GET, looking inside the
 * DCB_ATTR_IEEE nest of the reply. The payload pointer and length are
 * returned through @payload_p / @payload_len_p; the return value is that
 * of __dcb_get_attribute().
 */
int dcb_get_attribute_va(struct dcb *dcb, const char *dev, int attr,
void **payload_p, __u16 *payload_len_p)
{
return __dcb_get_attribute(dcb, DCB_CMD_IEEE_GET, dev, attr,
payload_p, payload_len_p,
dcb_get_attribute_cb);
}
/*
 * Fetch attribute @attr for @dev using DCB command @cmd, looking at the
 * top-level attributes of the reply (no DCB_ATTR_IEEE unpacking). The
 * payload pointer and length are returned through @payload_p /
 * @payload_len_p; the return value is that of __dcb_get_attribute().
 */
int dcb_get_attribute_bare(struct dcb *dcb, int cmd, const char *dev, int attr,
void **payload_p, __u16 *payload_len_p)
{
return __dcb_get_attribute(dcb, cmd, dev, attr,
payload_p, payload_len_p,
dcb_get_attribute_bare_cb);
}
/* Read fixed-size attribute ATTR of DEV into DATA.
 *
 * The attribute is fetched with dcb_get_attribute_va(); its payload must be
 * exactly DATA_LEN bytes long, otherwise -EINVAL is returned and DATA is left
 * untouched. Returns 0 on success or the error from the fetch.
 */
int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr, void *data, size_t data_len)
{
	__u16 payload_len;
	void *payload;
	int ret;

	ret = dcb_get_attribute_va(dcb, dev, attr, &payload, &payload_len);
	if (ret)
		return ret;

	if (payload_len != data_len) {
		/* data_len is a size_t, which is unsigned: %zu, not %zd. */
		fprintf(stderr, "Wrong len %d, expected %zu\n", payload_len, data_len);
		return -EINVAL;
	}

	memcpy(data, payload, data_len);
	return 0;
}
/* Build an RTM_SETDCB request carrying COMMAND for DEV, let CB append the
 * attributes to set (DATA is forwarded to CB), send it, and check the reply
 * via dcb_set_attribute_cb with RESPONSE_ATTR as the expected response
 * attribute.
 *
 * Returns 0 on success, CB's error if it fails, or the dcb_talk() result.
 */
static int __dcb_set_attribute(struct dcb *dcb, int command, const char *dev,
			       int (*cb)(struct dcb *, struct nlmsghdr *, void *),
			       void *data, int response_attr)
{
	struct dcb_set_attribute_response resp = {
		.response_attr = response_attr,
	};
	struct nlmsghdr *nlh = dcb_prepare(dcb, dev, RTM_SETDCB, command);
	int err;

	err = cb(dcb, nlh, data);
	if (err)
		return err;

	err = dcb_talk(dcb, nlh, dcb_set_attribute_cb, &resp);
	if (err)
		perror("Attribute write");

	return err;
}
/* Closure handed to the dcb_set_attribute_ieee_cb() function: the caller's
 * attribute-filling callback plus the opaque pointer to forward to it.
 */
struct dcb_set_attribute_ieee_cb {
/* Callback that appends attributes to the message. */
int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data);
/* Passed unchanged as the last argument of cb. */
void *data;
};
/* Wrap the caller's attributes in a DCB_ATTR_IEEE nest.
 *
 * DATA is a struct dcb_set_attribute_ieee_cb; its callback is invoked between
 * opening and closing the nest. If the callback fails, its error is returned
 * and the nest is left unclosed.
 */
static int dcb_set_attribute_ieee_cb(struct dcb *dcb, struct nlmsghdr *nlh, void *data)
{
	struct dcb_set_attribute_ieee_cb *wrapped = data;
	struct nlattr *ieee = mnl_attr_nest_start(nlh, DCB_ATTR_IEEE);
	int err = wrapped->cb(dcb, nlh, wrapped->data);

	if (err == 0)
		mnl_attr_nest_end(nlh, ieee);

	return err;
}
int dcb_set_attribute_va(struct dcb *dcb, int command, const char *dev,
int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data),
void *data)
{
struct dcb_set_attribute_ieee_cb ieee_data = {
.cb = cb,
.data = data,
};
return __dcb_set_attribute(dcb, command, dev,
&dcb_set_attribute_ieee_cb, &ieee_data,
DCB_ATTR_IEEE);
}
/* Description of a single attribute to append, consumed by
 * dcb_set_attribute_put().
 */
struct dcb_set_attribute {
/* Netlink attribute type. */
int attr;
/* Payload bytes to copy into the attribute. */
const void *data;
/* Number of payload bytes at data. */
size_t data_len;
};
static int dcb_set_attribute_put(struct dcb *dcb, struct nlmsghdr *nlh, void *data)
{
struct dcb_set_attribute *dsa = data;
mnl_attr_put(nlh, dsa->attr, dsa->data_len, dsa->data);
return 0;
}
int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr, const void *data, size_t data_len)
{
struct dcb_set_attribute dsa = {
.attr = attr,
.data = data,
.data_len = data_len,
};
return dcb_set_attribute_va(dcb, DCB_CMD_IEEE_SET, dev,
&dcb_set_attribute_put, &dsa);
}
int dcb_set_attribute_bare(struct dcb *dcb, int command, const char *dev,
int attr, const void *data, size_t data_len,
int response_attr)
{
struct dcb_set_attribute dsa = {
.attr = attr,
.data = data,
.data_len = data_len,
};
return __dcb_set_attribute(dcb, command, dev,
&dcb_set_attribute_put, &dsa, response_attr);
}
void dcb_print_array_u8(const __u8 *array, size_t size)
{
SPRINT_BUF(b);
size_t i;
for (i = 0; i < size; i++) {
snprintf(b, sizeof(b), "%zd:%%d ", i);
print_uint(PRINT_ANY, NULL, b, array[i]);
}
}
void dcb_print_array_u64(const __u64 *array, size_t size)
{
SPRINT_BUF(b);
size_t i;
for (i = 0; i < size; i++) {
snprintf(b, sizeof(b), "%zd:%%" PRIu64 " ", i);
print_u64(PRINT_ANY, NULL, b, array[i]);
}
}
void dcb_print_array_on_off(const __u8 *array, size_t size)
{
SPRINT_BUF(b);
size_t i;
for (i = 0; i < size; i++) {
snprintf(b, sizeof(b), "%zd:%%s ", i);
print_on_off(PRINT_ANY, NULL, b, array[i]);
}
}
void dcb_print_array_kw(const __u8 *array, size_t array_size,
const char *const kw[], size_t kw_size)
{
SPRINT_BUF(b);
size_t i;
for (i = 0; i < array_size; i++) {
__u8 emt = array[i];
snprintf(b, sizeof(b), "%zd:%%s ", i);
if (emt < kw_size && kw[emt])
print_string(PRINT_ANY, NULL, b, kw[emt]);
else
print_string(PRINT_ANY, NULL, b, "???");
}
}
/* Print ARRAY under a name: in JSON mode the elements go into an array called
 * JSON_NAME; in plain-text mode they are preceded by "FP_NAME ". The elements
 * themselves are emitted by PRINT_ARRAY(ARRAY, SIZE).
 */
void dcb_print_named_array(const char *json_name, const char *fp_name,
			   const __u8 *array, size_t size,
			   void (*print_array)(const __u8 *, size_t))
{
	/* Header: JSON array opener and the plain-text label. */
	open_json_array(PRINT_JSON, json_name);
	print_string(PRINT_FP, NULL, "%s ", fp_name);

	(*print_array)(array, size);

	close_json_array(PRINT_JSON, json_name);
}
/* Validate one KEY:VALUE mapping and store it through SET_ARRAY.
 *
 * KEY must be in 0..MAX_KEY, or (__u32) -1 to mean "all keys", in which case
 * SET_ARRAY is called for every key from 0 to MAX_KEY inclusive. VALUE must
 * not exceed MAX_VALUE. WHAT_KEY and WHAT_VALUE name the two sides in
 * diagnostics. SET_ARRAY_DATA is forwarded to SET_ARRAY.
 *
 * Returns 0 on success, or -EINVAL (after printing a diagnostic to stderr and
 * without calling SET_ARRAY) if either bound is violated.
 */
int dcb_parse_mapping(const char *what_key, __u32 key, __u32 max_key,
		      const char *what_value, __u64 value, __u64 max_value,
		      void (*set_array)(__u32 index, __u64 value, void *data),
		      void *set_array_data)
{
	bool is_all = key == (__u32) -1;

	if (!is_all && key > max_key) {
		/* max_key is unsigned: %u, not %d. */
		fprintf(stderr, "In %s:%s mapping, %s is expected to be 0..%u\n",
			what_key, what_value, what_key, max_key);
		return -EINVAL;
	}

	if (value > max_value) {
		fprintf(stderr, "In %s:%s mapping, %s is expected to be 0..%llu\n",
			what_key, what_value, what_value,
			(unsigned long long) max_value);
		return -EINVAL;
	}

	if (!is_all) {
		set_array(key, value, set_array_data);
		return 0;
	}

	/* Test for the last key after the call rather than in the loop
	 * condition: "key <= max_key" never becomes false when max_key is
	 * the largest __u32, because key wraps around to 0.
	 */
	for (key = 0; ; key++) {
		set_array(key, value, set_array_data);
		if (key == max_key)
			break;
	}

	return 0;
}
/* dcb_parse_mapping() setter: treat DATA as an array of __u8 and store VALUE
 * (converted to __u8) at index KEY.
 */
void dcb_set_u8(__u32 key, __u64 value, void *data)
{
	((__u8 *) data)[key] = value;
}
/* dcb_parse_mapping() setter: treat DATA as an array of __u32 and store VALUE
 * (converted to __u32) at index KEY.
 */
void dcb_set_u32(__u32 key, __u64 value, void *data)
{
	((__u32 *) data)[key] = value;
}
/* dcb_parse_mapping() setter: treat DATA as an array of __u64 and store VALUE
 * at index KEY.
 */
void dcb_set_u64(__u32 key, __u64 value, void *data)
{
	((__u64 *) data)[key] = value;
}
/* Parse the leading "dev DEV" of a sub-command and continue with AND_THEN.
 *
 * With no arguments, or with "help" first, HELP() is called and 0 returned.
 * Otherwise the arguments must start with "dev" followed by an interface name
 * accepted by check_ifname(); AND_THEN is then called with that name and the
 * remaining arguments, and its result is returned. Anything else prints a
 * diagnostic plus HELP() and returns -EINVAL.
 */
int dcb_cmd_parse_dev(struct dcb *dcb, int argc, char **argv,
		      int (*and_then)(struct dcb *dcb, const char *dev,
				      int argc, char **argv),
		      void (*help)(void))
{
	const char *dev;

	if (!argc || matches(*argv, "help") == 0) {
		help();
		return 0;
	}

	if (matches(*argv, "dev") != 0) {
		/* Terminate the line so the usage text that follows does not
		 * get glued onto the diagnostic.
		 */
		fprintf(stderr, "Expected `dev DEV', not `%s'\n", *argv);
		help();
		return -EINVAL;
	}

	NEXT_ARG();
	dev = *argv;
	if (check_ifname(dev)) {
		invarg("not a valid ifname", *argv);
		return -EINVAL;
	}

	NEXT_ARG_FWD();
	return and_then(dcb, dev, argc, argv);
}
/* Write the top-level usage summary of the dcb tool to stderr. */
static void dcb_help(void)
{
	static const char usage[] =
		"Usage: dcb [ OPTIONS ] OBJECT { COMMAND | help }\n"
		" dcb [ -f | --force ] { -b | --batch } filename [ -n | --netns ] netnsname\n"
		"where OBJECT := { app | buffer | dcbx | ets | maxrate | pfc }\n"
		" OPTIONS := [ -V | --Version | -i | --iec | -j | --json\n"
		" | -N | --Numeric | -p | --pretty\n"
		" | -s | --statistics | -v | --verbose]\n";

	fputs(usage, stderr);
}
static int dcb_cmd(struct dcb *dcb, int argc, char **argv)
{
if (!argc || matches(*argv, "help") == 0) {
dcb_help();
return 0;
} else if (matches(*argv, "app") == 0) {
return dcb_cmd_app(dcb, argc - 1, argv + 1);
} else if (matches(*argv, "buffer") == 0) {
return dcb_cmd_buffer(dcb, argc - 1, argv + 1);
} else if (matches(*argv, "dcbx") == 0) {
return dcb_cmd_dcbx(dcb, argc - 1, argv + 1);
} else if (matches(*argv, "ets") == 0) {
return dcb_cmd_ets(dcb, argc - 1, argv + 1);
} else if (matches(*argv, "maxrate") == 0) {
return dcb_cmd_maxrate(dcb, argc - 1, argv + 1);
} else if (matches(*argv, "pfc") == 0) {
return dcb_cmd_pfc(dcb, argc - 1, argv + 1);
}
fprintf(stderr, "Object \"%s\" is unknown\n", *argv);
return -ENOENT;
}
/* Batch-mode adapter with the callback shape do_batch() is given here:
 * DATA is the struct dcb, and the command itself is handled by dcb_cmd().
 */
static int dcb_batch_cmd(int argc, char *argv[], void *data)
{
	return dcb_cmd((struct dcb *) data, argc, argv);
}
/* Run the commands in batch file NAME through do_batch(), with
 * dcb_batch_cmd() as the per-command handler and DCB as its context. FORCE is
 * forwarded to do_batch() unchanged.
 */
static int dcb_batch(struct dcb *dcb, const char *name, bool force)
{
	void *ctx = dcb;

	return do_batch(name, force, dcb_batch_cmd, ctx);
}
/* Entry point of the dcb tool: parse global options, initialise the dcb
 * context, then run either a batch file or the single command given on the
 * command line. Returns EXIT_SUCCESS or EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
static const struct option long_options[] = {
{ "Version", no_argument, NULL, 'V' },
{ "force", no_argument, NULL, 'f' },
{ "batch", required_argument, NULL, 'b' },
{ "iec", no_argument, NULL, 'i' },
{ "json", no_argument, NULL, 'j' },
{ "Numeric", no_argument, NULL, 'N' },
{ "pretty", no_argument, NULL, 'p' },
{ "statistics", no_argument, NULL, 's' },
{ "netns", required_argument, NULL, 'n' },
{ "help", no_argument, NULL, 'h' },
{ NULL, 0, NULL, 0 }
};
const char *batch_file = NULL;
bool force = false;
struct dcb *dcb;
int opt;
int err;
int ret;
dcb = dcb_alloc();
if (!dcb) {
fprintf(stderr, "Failed to allocate memory for dcb\n");
return EXIT_FAILURE;
}
/* NOTE(review): 'v' is accepted by the short-option string below but has no
 * case in the switch, so "-v" ends up in the default branch and is reported
 * as an unknown option; there is also no "verbose" long option although the
 * usage text advertises -v | --verbose. Confirm the intended behaviour.
 */
while ((opt = getopt_long(argc, argv, "b:fhijn:psvNV",
long_options, NULL)) >= 0) {
switch (opt) {
case 'V':
printf("dcb utility, iproute2-%s\n", version);
ret = EXIT_SUCCESS;
goto dcb_free;
case 'f':
force = true;
break;
case 'b':
batch_file = optarg;
break;
case 'j':
dcb->json_output = true;
break;
case 'N':
dcb->numeric = true;
break;
case 'p':
pretty = true;
break;
case 's':
dcb->stats = true;
break;
case 'n':
if (netns_switch(optarg)) {
ret = EXIT_FAILURE;
goto dcb_free;
}
break;
case 'i':
dcb->use_iec = true;
break;
case 'h':
dcb_help();
ret = EXIT_SUCCESS;
goto dcb_free;
default:
fprintf(stderr, "Unknown option.\n");
dcb_help();
ret = EXIT_FAILURE;
goto dcb_free;
}
}
/* Skip past the options so argv starts at the OBJECT word. */
argc -= optind;
argv += optind;
/* Everything above exits through dcb_free only; from here on dcb_init() has
 * been called, so later failures exit through dcb_fini as well.
 */
err = dcb_init(dcb);
if (err) {
ret = EXIT_FAILURE;
goto dcb_free;
}
/* A batch file, when given, takes precedence over a command-line command. */
if (batch_file)
err = dcb_batch(dcb, batch_file, force);
else
err = dcb_cmd(dcb, argc, argv);
if (err) {
ret = EXIT_FAILURE;
goto dcb_fini;
}
ret = EXIT_SUCCESS;
/* The labels share their names with the cleanup functions they call. */
dcb_fini:
dcb_fini(dcb);
dcb_free:
dcb_free(dcb);
return ret;
}

View File

@ -1,81 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __DCB_H__
#define __DCB_H__ 1
#include <libmnl/libmnl.h>
#include <stdbool.h>
#include <stddef.h>
/* dcb.c */
/* Shared state of one dcb invocation, passed to every command handler. */
struct dcb {
/* Buffer in which netlink requests are built and replies received. */
char *buf;
/* libmnl socket used for the netlink exchange. */
struct mnl_socket *nl;
/* Set by the -j / --json option. */
bool json_output;
/* Set by the -s / --statistics option. */
bool stats;
/* Set by the -i / --iec option. */
bool use_iec;
/* Set by the -N / --Numeric option. */
bool numeric;
};
/* Validate a KEY:VALUE mapping against max_key / max_value and store it via
 * set_array; a key of (__u32) -1 stands for all keys 0..max_key.
 */
int dcb_parse_mapping(const char *what_key, __u32 key, __u32 max_key,
const char *what_value, __u64 value, __u64 max_value,
void (*set_array)(__u32 index, __u64 value, void *data),
void *set_array_data);
/* Parse the leading "dev DEV" of a sub-command, then call and_then with the
 * device name and the remaining arguments; help prints the usage text.
 */
int dcb_cmd_parse_dev(struct dcb *dcb, int argc, char **argv,
int (*and_then)(struct dcb *dcb, const char *dev,
int argc, char **argv),
void (*help)(void));
/* set_array callbacks for dcb_parse_mapping(): data is an array of the named
 * element type and value is stored at index key.
 */
void dcb_set_u8(__u32 key, __u64 value, void *data);
void dcb_set_u32(__u32 key, __u64 value, void *data);
void dcb_set_u64(__u32 key, __u64 value, void *data);
/* Get / set a fixed-size attribute of exactly data_len bytes. */
int dcb_get_attribute(struct dcb *dcb, const char *dev, int attr,
void *data, size_t data_len);
int dcb_set_attribute(struct dcb *dcb, const char *dev, int attr,
const void *data, size_t data_len);
/* Variable-size variants: the getter hands back the payload pointer and
 * length, the setter lets cb append the attributes to the message.
 */
int dcb_get_attribute_va(struct dcb *dcb, const char *dev, int attr,
void **payload_p, __u16 *payload_len_p);
int dcb_set_attribute_va(struct dcb *dcb, int command, const char *dev,
int (*cb)(struct dcb *dcb, struct nlmsghdr *nlh, void *data),
void *data);
/* "Bare" variants: the caller chooses the DCB command, and for the setter
 * also the attribute expected in the response.
 */
int dcb_get_attribute_bare(struct dcb *dcb, int cmd, const char *dev, int attr,
void **payload_p, __u16 *payload_len_p);
int dcb_set_attribute_bare(struct dcb *dcb, int command, const char *dev,
int attr, const void *data, size_t data_len,
int response_attr);
/* Printing helpers: emit array elements as "INDEX:VALUE " pairs, optionally
 * under a JSON array name / plain-text label.
 */
void dcb_print_named_array(const char *json_name, const char *fp_name,
const __u8 *array, size_t size,
void (*print_array)(const __u8 *, size_t));
void dcb_print_array_u8(const __u8 *array, size_t size);
void dcb_print_array_u64(const __u64 *array, size_t size);
void dcb_print_array_on_off(const __u8 *array, size_t size);
void dcb_print_array_kw(const __u8 *array, size_t array_size,
const char *const kw[], size_t kw_size);
/* Per-object command entry points, one per source file. */
/* dcb_app.c */
int dcb_cmd_app(struct dcb *dcb, int argc, char **argv);
/* dcb_buffer.c */
int dcb_cmd_buffer(struct dcb *dcb, int argc, char **argv);
/* dcb_dcbx.c */
int dcb_cmd_dcbx(struct dcb *dcb, int argc, char **argv);
/* dcb_ets.c */
int dcb_cmd_ets(struct dcb *dcb, int argc, char **argv);
/* dcb_maxrate.c */
int dcb_cmd_maxrate(struct dcb *dcb, int argc, char **argv);
/* dcb_pfc.c */
int dcb_cmd_pfc(struct dcb *dcb, int argc, char **argv);
#endif /* __DCB_H__ */

View File

@ -1,795 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <libmnl/libmnl.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
#include "rt_names.h"
/* Write the usage text for "dcb app add | del | replace" to stderr. */
static void dcb_app_help_add(void)
{
	/* The dscp-prio key is spelled DSCP so that it refers to the DSCP
	 * range defined in the legend below; "INTEGER" had no definition.
	 */
	fprintf(stderr,
		"Usage: dcb app { add | del | replace } dev STRING\n"
		" [ default-prio PRIO ]\n"
		" [ ethtype-prio ET:PRIO ]\n"
		" [ stream-port-prio PORT:PRIO ]\n"
		" [ dgram-port-prio PORT:PRIO ]\n"
		" [ port-prio PORT:PRIO ]\n"
		" [ dscp-prio DSCP:PRIO ]\n"
		"\n"
		" where PRIO := { 0 .. 7 }\n"
		" ET := { 0x600 .. 0xffff }\n"
		" PORT := { 1 .. 65535 }\n"
		" DSCP := { 0 .. 63 }\n"
		"\n"
	);
}
/* Write the usage text for "dcb app show | flush" to stderr. */
static void dcb_app_help_show_flush(void)
{
	static const char usage[] =
		"Usage: dcb app { show | flush } dev STRING\n"
		" [ default-prio ]\n"
		" [ ethtype-prio ]\n"
		" [ stream-port-prio ]\n"
		" [ dgram-port-prio ]\n"
		" [ port-prio ]\n"
		" [ dscp-prio ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Write the complete "dcb app" usage to stderr: the help line, then the
 * show/flush usage, then the add/del/replace usage.
 */
static void dcb_app_help(void)
{
	fputs("Usage: dcb app help\n"
	      "\n",
	      stderr);

	dcb_app_help_show_flush();
	dcb_app_help_add();
}
/* Growable array of APP entries. A zero-initialised table is empty; storage
 * is obtained with realloc() in dcb_app_table_push() and released by
 * dcb_app_table_fini().
 */
struct dcb_app_table {
/* Heap array holding the entries. */
struct dcb_app *apps;
/* Number of valid entries in apps. */
size_t n_apps;
};
/* Release the entry array of TAB. The table's fields are not reset. */
static void dcb_app_table_fini(struct dcb_app_table *tab)
{
	struct dcb_app *entries = tab->apps;

	free(entries);
}
/* Append a copy of APP to TAB, growing the array by one element.
 *
 * Returns 0 on success. If the allocation fails, a diagnostic is printed,
 * TAB is left unchanged and -ENOMEM is returned.
 */
static int dcb_app_table_push(struct dcb_app_table *tab, struct dcb_app *app)
{
	size_t new_count = tab->n_apps + 1;
	struct dcb_app *grown;

	grown = realloc(tab->apps, new_count * sizeof(*tab->apps));
	if (grown == NULL) {
		perror("Cannot allocate APP table");
		return -ENOMEM;
	}

	grown[tab->n_apps] = *app;
	tab->apps = grown;
	tab->n_apps = new_count;
	return 0;
}
/* Drop from A every entry that also appears in B with the same selector,
 * protocol and priority. The surviving entries are compacted in place, in
 * their original order.
 */
static void dcb_app_table_remove_existing(struct dcb_app_table *a,
					  const struct dcb_app_table *b)
{
	size_t kept = 0;
	size_t i, j;

	for (i = 0; i < a->n_apps; i++) {
		const struct dcb_app *cand = &a->apps[i];

		for (j = 0; j < b->n_apps; j++) {
			const struct dcb_app *other = &b->apps[j];

			if (cand->selector == other->selector &&
			    cand->protocol == other->protocol &&
			    cand->priority == other->priority)
				break;
		}

		/* The scan ran off the end of B: no exact match, keep it. */
		if (j == b->n_apps)
			a->apps[kept++] = *cand;
	}

	a->n_apps = kept;
}
/* Reduce A to the entries that B replaces: those whose selector/protocol pair
 * occurs in B, but for which B has no entry with the same priority as well.
 * The surviving entries are compacted in place, in their original order.
 */
static void dcb_app_table_remove_replaced(struct dcb_app_table *a,
					  const struct dcb_app_table *b)
{
	size_t kept = 0;
	size_t i, j;

	for (i = 0; i < a->n_apps; i++) {
		const struct dcb_app *cand = &a->apps[i];
		bool key_in_b = false;
		bool exact = false;

		for (j = 0; j < b->n_apps && !exact; j++) {
			const struct dcb_app *other = &b->apps[j];

			if (cand->selector != other->selector ||
			    cand->protocol != other->protocol)
				continue;

			key_in_b = true;
			exact = cand->priority == other->priority;
		}

		/* What stays in A is what the caller goes on to delete:
		 * same sel/pid as something in B, but no full
		 * sel/pid/prio match there.
		 */
		if (key_in_b && !exact)
			a->apps[kept++] = *cand;
	}

	a->n_apps = kept;
}
/* Append every entry of B to A, in order. Stops at the first failed push and
 * returns its error; returns 0 when all entries were appended.
 */
static int dcb_app_table_copy(struct dcb_app_table *a,
			      const struct dcb_app_table *b)
{
	int err = 0;
	size_t idx;

	for (idx = 0; err == 0 && idx < b->n_apps; idx++)
		err = dcb_app_table_push(a, &b->apps[idx]);

	return err;
}
/* Order two APP entries by protocol first and priority second. The selector
 * does not take part in the comparison. Returns a negative, zero or positive
 * value in the manner of a qsort() comparator.
 */
static int dcb_app_cmp(const struct dcb_app *a, const struct dcb_app *b)
{
	if (a->protocol != b->protocol)
		return a->protocol < b->protocol ? -1 : 1;

	return a->priority - b->priority;
}
/* qsort() adapter: both arguments point at struct dcb_app elements. */
static int dcb_app_cmp_cb(const void *a, const void *b)
{
	const struct dcb_app *lhs = a;
	const struct dcb_app *rhs = b;

	return dcb_app_cmp(lhs, rhs);
}
/* Sort the entries of TAB in place using the dcb_app_cmp() ordering. */
static void dcb_app_table_sort(struct dcb_app_table *tab)
{
	const size_t elem_size = sizeof(*tab->apps);

	qsort(tab->apps, tab->n_apps, elem_size, dcb_app_cmp_cb);
}
/* Context for parsing KEY:PRIO mappings into an APP table; passed as the
 * opaque data pointer down to dcb_app_parse_mapping_cb().
 */
struct dcb_app_parse_mapping {
/* Selector stamped on every entry created from the current keyword. */
__u8 selector;
/* Table the parsed entries are appended to. */
struct dcb_app_table *tab;
/* First error returned by dcb_app_table_push(); once non-zero, further
 * entries are skipped.
 */
int err;
};
/* dcb_parse_mapping() setter: append an APP entry with protocol KEY and
 * priority VALUE, using the selector from the parse context in DATA. Does
 * nothing once the context has recorded an error; a failed push is recorded
 * in the context's err field.
 */
static void dcb_app_parse_mapping_cb(__u32 key, __u64 value, void *data)
{
	struct dcb_app_parse_mapping *ctx = data;
	struct dcb_app entry;

	if (ctx->err)
		return;

	entry = (struct dcb_app) {
		.selector = ctx->selector,
		.priority = value,
		.protocol = key,
	};
	ctx->err = dcb_app_table_push(ctx->tab, &entry);
}
/* Mapping callback for "ethtype-prio ET:PRIO".
 *
 * Rejects EtherTypes below 0x600 and priorities that do not parse as a u8
 * with -EINVAL; otherwise defers to dcb_parse_mapping() with an EtherType
 * limit of 0xffff and a priority limit of IEEE_8021QAZ_MAX_TCS - 1.
 */
static int dcb_app_parse_mapping_ethtype_prio(__u32 key, char *value, void *data)
{
	const __u32 min_ethtype = 0x600;
	__u8 prio;

	if (key < min_ethtype) {
		fprintf(stderr, "Protocol IDs < 0x600 are reserved for EtherType\n");
		return -EINVAL;
	}

	if (get_u8(&prio, value, 0) != 0)
		return -EINVAL;

	return dcb_parse_mapping("ETHTYPE", key, 0xffff,
				 "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1,
				 dcb_app_parse_mapping_cb, data);
}
/* Parse the key of a dscp-prio mapping into *KEY.
 *
 * ARG is first tried with parse_mapping_num_all(); if that fails it is looked
 * up with rtnl_dsfield_a2n(), and the resulting dsfield value is shifted
 * right by two bits to obtain the DSCP. A dsfield value with either of its
 * two low bits set is rejected. Returns 0 on success, -1 on failure.
 */
static int dcb_app_parse_dscp(__u32 *key, const char *arg)
{
	const __u32 non_dscp_bits = 0x03;

	if (!parse_mapping_num_all(key, arg))
		return 0;

	if (rtnl_dsfield_a2n(key, arg))
		return -1;

	if ((*key & non_dscp_bits) != 0) {
		fprintf(stderr, "The values `%s' uses non-DSCP bits.\n", arg);
		return -1;
	}

	/* dsfield -> DSCP: discard the two low bits. */
	*key = *key >> 2;
	return 0;
}
/* Mapping callback for "dscp-prio DSCP:PRIO".
 *
 * A priority that does not parse as a u8 yields -EINVAL; otherwise defers to
 * dcb_parse_mapping() with a DSCP limit of 63 and a priority limit of
 * IEEE_8021QAZ_MAX_TCS - 1.
 */
static int dcb_app_parse_mapping_dscp_prio(__u32 key, char *value, void *data)
{
	const __u32 max_dscp = 63;
	__u8 prio;

	if (get_u8(&prio, value, 0) != 0)
		return -EINVAL;

	return dcb_parse_mapping("DSCP", key, max_dscp,
				 "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1,
				 dcb_app_parse_mapping_cb, data);
}
/* Mapping callback shared by the three PORT:PRIO keywords.
 *
 * Rejects port 0 and priorities that do not parse as a u8 with -EINVAL;
 * otherwise defers to dcb_parse_mapping() with a port limit of 0xffff and a
 * priority limit of IEEE_8021QAZ_MAX_TCS - 1.
 */
static int dcb_app_parse_mapping_port_prio(__u32 key, char *value, void *data)
{
	__u8 prio;

	if (!key) {
		fprintf(stderr, "Port ID of 0 is invalid\n");
		return -EINVAL;
	}

	if (get_u8(&prio, value, 0) != 0)
		return -EINVAL;

	return dcb_parse_mapping("PORT", key, 0xffff,
				 "PRIO", prio, IEEE_8021QAZ_MAX_TCS - 1,
				 dcb_app_parse_mapping_cb, data);
}
static int dcb_app_parse_default_prio(int *argcp, char ***argvp, struct dcb_app_table *tab)
{
int argc = *argcp;
char **argv = *argvp;
int ret = 0;
while (argc > 0) {
struct dcb_app app;
__u8 prio;
if (get_u8(&prio, *argv, 0)) {
ret = 1;
break;
}
app = (struct dcb_app){
.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
.protocol = 0,
.priority = prio,
};
ret = dcb_app_table_push(tab, &app);
if (ret != 0)
break;
argc--, argv++;
}
*argcp = argc;
*argvp = argv;
return ret;
}
/* EtherType-selector entry with a non-zero protocol. */
static bool dcb_app_is_ethtype(const struct dcb_app *app)
{
	if (app->selector != IEEE_8021QAZ_APP_SEL_ETHERTYPE)
		return false;

	return app->protocol != 0;
}

/* EtherType-selector entry with protocol 0, i.e. a default priority. */
static bool dcb_app_is_default(const struct dcb_app *app)
{
	if (app->selector != IEEE_8021QAZ_APP_SEL_ETHERTYPE)
		return false;

	return app->protocol == 0;
}

/* Entry with the DSCP selector. */
static bool dcb_app_is_dscp(const struct dcb_app *app)
{
	return IEEE_8021QAZ_APP_SEL_DSCP == app->selector;
}

/* Entry with the STREAM selector. */
static bool dcb_app_is_stream_port(const struct dcb_app *app)
{
	return IEEE_8021QAZ_APP_SEL_STREAM == app->selector;
}

/* Entry with the DGRAM selector. */
static bool dcb_app_is_dgram_port(const struct dcb_app *app)
{
	return IEEE_8021QAZ_APP_SEL_DGRAM == app->selector;
}

/* Entry with the ANY selector. */
static bool dcb_app_is_port(const struct dcb_app *app)
{
	return IEEE_8021QAZ_APP_SEL_ANY == app->selector;
}
/* Print PROTOCOL as a decimal mapping key followed by ':'. */
static int dcb_app_print_key_dec(__u16 protocol)
{
	int printed = print_uint(PRINT_ANY, NULL, "%d:", protocol);

	return printed;
}

/* Print PROTOCOL as a hexadecimal mapping key followed by ':'. */
static int dcb_app_print_key_hex(__u16 protocol)
{
	int printed = print_uint(PRINT_ANY, NULL, "%x:", protocol);

	return printed;
}
/* Print a DSCP mapping key followed by ':'.
 *
 * PROTOCOL is shifted left by two bits and looked up with
 * rtnl_dsfield_get_name(). Outside of JSON context, a name found that way is
 * printed; in every other case the key is printed as a decimal number.
 */
static int dcb_app_print_key_dscp(__u16 protocol)
{
	const char *name = rtnl_dsfield_get_name(protocol << 2);
	bool textual = !is_json_context();

	if (textual && name != NULL)
		return print_string(PRINT_FP, NULL, "%s:", name);

	return print_uint(PRINT_ANY, NULL, "%d:", protocol);
}
/* Print the entries of TAB accepted by FILTER as one named group.
 *
 * Each accepted entry is printed as its key (via PRINT_KEY) followed by its
 * priority; in JSON every entry is an anonymous array inside an array called
 * JSON_NAME, in plain text the group is preceded by "FP_NAME " and followed
 * by a newline. Nothing at all is printed when no entry is accepted.
 */
static void dcb_app_print_filtered(const struct dcb_app_table *tab,
				   bool (*filter)(const struct dcb_app *),
				   int (*print_key)(__u16 protocol),
				   const char *json_name,
				   const char *fp_name)
{
	size_t printed = 0;
	size_t idx;

	for (idx = 0; idx < tab->n_apps; idx++) {
		const struct dcb_app *entry = &tab->apps[idx];

		if (!filter(entry))
			continue;

		/* The group header is emitted lazily, before the first hit. */
		if (printed++ == 0) {
			open_json_array(PRINT_JSON, json_name);
			print_string(PRINT_FP, NULL, "%s ", fp_name);
		}

		open_json_array(PRINT_JSON, NULL);
		print_key(entry->protocol);
		print_uint(PRINT_ANY, NULL, "%d ", entry->priority);
		close_json_array(PRINT_JSON, NULL);
	}

	if (printed == 0)
		return;

	close_json_array(PRINT_JSON, json_name);
	print_nl();
}
static void dcb_app_print_ethtype_prio(const struct dcb_app_table *tab)
{
dcb_app_print_filtered(tab, dcb_app_is_ethtype, dcb_app_print_key_hex,
"ethtype_prio", "ethtype-prio");
}
static void dcb_app_print_dscp_prio(const struct dcb *dcb,
const struct dcb_app_table *tab)
{
dcb_app_print_filtered(tab, dcb_app_is_dscp,
dcb->numeric ? dcb_app_print_key_dec
: dcb_app_print_key_dscp,
"dscp_prio", "dscp-prio");
}
static void dcb_app_print_stream_port_prio(const struct dcb_app_table *tab)
{
dcb_app_print_filtered(tab, dcb_app_is_stream_port, dcb_app_print_key_dec,
"stream_port_prio", "stream-port-prio");
}
static void dcb_app_print_dgram_port_prio(const struct dcb_app_table *tab)
{
dcb_app_print_filtered(tab, dcb_app_is_dgram_port, dcb_app_print_key_dec,
"dgram_port_prio", "dgram-port-prio");
}
static void dcb_app_print_port_prio(const struct dcb_app_table *tab)
{
dcb_app_print_filtered(tab, dcb_app_is_port, dcb_app_print_key_dec,
"port_prio", "port-prio");
}
static void dcb_app_print_default_prio(const struct dcb_app_table *tab)
{
bool first = true;
size_t i;
for (i = 0; i < tab->n_apps; i++) {
if (!dcb_app_is_default(&tab->apps[i]))
continue;
if (first) {
open_json_array(PRINT_JSON, "default_prio");
print_string(PRINT_FP, NULL, "default-prio ", NULL);
first = false;
}
print_uint(PRINT_ANY, NULL, "%d ", tab->apps[i].priority);
}
if (!first) {
close_json_array(PRINT_JSON, "default_prio");
print_nl();
}
}
/* Print every APP group of the table, in a fixed order: ethtype-prio,
 * default-prio, dscp-prio, stream-port-prio, dgram-port-prio, port-prio.
 * dcb is only needed by the dscp-prio printer.
 */
static void dcb_app_print(const struct dcb *dcb, const struct dcb_app_table *tab)
{
dcb_app_print_ethtype_prio(tab);
dcb_app_print_default_prio(tab);
dcb_app_print_dscp_prio(dcb, tab);
dcb_app_print_stream_port_prio(tab);
dcb_app_print_dgram_port_prio(tab);
dcb_app_print_port_prio(tab);
}
/* Attribute callback for the contents of DCB_ATTR_IEEE_APP_TABLE.
 *
 * A DCB_ATTR_IEEE_APP attribute whose payload is at least one struct dcb_app
 * long is appended to the table passed in DATA. Attributes of any other type,
 * or with a shorter payload, are reported on stderr and skipped. Only a
 * failed push aborts the parse with MNL_CB_ERROR.
 */
static int dcb_app_get_table_attr_cb(const struct nlattr *attr, void *data)
{
	struct dcb_app_table *tab = data;
	uint16_t type = mnl_attr_get_type(attr);
	uint16_t len;

	if (type != DCB_ATTR_IEEE_APP) {
		fprintf(stderr, "Unknown attribute in DCB_ATTR_IEEE_APP_TABLE: %d\n",
			type);
		return MNL_CB_OK;
	}

	len = mnl_attr_get_payload_len(attr);
	if (len < sizeof(struct dcb_app)) {
		fprintf(stderr, "DCB_ATTR_IEEE_APP payload expected to have size %zd, not %d\n",
			sizeof(struct dcb_app), len);
		return MNL_CB_OK;
	}

	if (dcb_app_table_push(tab, mnl_attr_get_payload(attr)) != 0)
		return MNL_CB_ERROR;

	return MNL_CB_OK;
}
/* Read the APP table of DEV into TAB.
 *
 * Fetches DCB_ATTR_IEEE_APP_TABLE and appends each contained entry to TAB via
 * dcb_app_get_table_attr_cb(). Returns 0 on success, the fetch error, or
 * -EINVAL if parsing the table payload does not end with MNL_CB_OK.
 */
static int dcb_app_get(struct dcb *dcb, const char *dev, struct dcb_app_table *tab)
{
	uint16_t len;
	void *table;
	int err;

	err = dcb_get_attribute_va(dcb, dev, DCB_ATTR_IEEE_APP_TABLE, &table, &len);
	if (err != 0)
		return err;

	if (mnl_attr_parse_payload(table, len, dcb_app_get_table_attr_cb, tab) != MNL_CB_OK)
		return -EINVAL;

	return 0;
}
/* Arguments of dcb_app_add_del_cb(): which entries to put into the request. */
struct dcb_app_add_del {
/* Source table of APP entries. */
const struct dcb_app_table *tab;
/* Optional predicate; when NULL every entry of tab is sent. */
bool (*filter)(const struct dcb_app *app);
};
static int dcb_app_add_del_cb(struct dcb *dcb, struct nlmsghdr *nlh, void *data)
{
struct dcb_app_add_del *add_del = data;
struct nlattr *nest;
size_t i;
nest = mnl_attr_nest_start(nlh, DCB_ATTR_IEEE_APP_TABLE);
for (i = 0; i < add_del->tab->n_apps; i++) {
const struct dcb_app *app = &add_del->tab->apps[i];
if (add_del->filter == NULL || add_del->filter(app))
mnl_attr_put(nlh, DCB_ATTR_IEEE_APP, sizeof(*app), app);
}
mnl_attr_nest_end(nlh, nest);
return 0;
}
static int dcb_app_add_del(struct dcb *dcb, const char *dev, int command,
const struct dcb_app_table *tab,
bool (*filter)(const struct dcb_app *))
{
struct dcb_app_add_del add_del = {
.tab = tab,
.filter = filter,
};
if (tab->n_apps == 0)
return 0;
return dcb_set_attribute_va(dcb, command, dev, dcb_app_add_del_cb, &add_del);
}
/* Parse the keyword/mapping arguments shared by "dcb app add", "del" and
 * "replace" and append the resulting APP entries to tab.
 *
 * Returns 0 on success and also after printing help (no arguments, or a
 * "help" keyword); -EINVAL for an unknown keyword; otherwise the first
 * non-zero result of a mapping parser or of appending to the table. Entries
 * already appended to tab stay there on failure.
 */
static int dcb_cmd_app_parse_add_del(struct dcb *dcb, const char *dev,
int argc, char **argv, struct dcb_app_table *tab)
{
/* pm.selector is set per keyword below; pm.err starts out as 0. */
struct dcb_app_parse_mapping pm = {
.tab = tab,
};
int ret;
if (!argc) {
dcb_app_help_add();
return 0;
}
do {
if (matches(*argv, "help") == 0) {
dcb_app_help_add();
return 0;
} else if (matches(*argv, "ethtype-prio") == 0) {
NEXT_ARG();
pm.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE;
ret = parse_mapping(&argc, &argv, false,
&dcb_app_parse_mapping_ethtype_prio,
&pm);
} else if (matches(*argv, "default-prio") == 0) {
NEXT_ARG();
/* Default priorities are bare numbers, not KEY:PRIO mappings, so they
 * have their own parser and their own diagnostic.
 */
ret = dcb_app_parse_default_prio(&argc, &argv, pm.tab);
if (ret != 0) {
fprintf(stderr, "Invalid default priority %s\n", *argv);
return ret;
}
} else if (matches(*argv, "dscp-prio") == 0) {
NEXT_ARG();
pm.selector = IEEE_8021QAZ_APP_SEL_DSCP;
/* The DSCP key goes through dcb_app_parse_dscp() instead of the default
 * key parsing used by parse_mapping().
 */
ret = parse_mapping_gen(&argc, &argv,
&dcb_app_parse_dscp,
&dcb_app_parse_mapping_dscp_prio,
&pm);
} else if (matches(*argv, "stream-port-prio") == 0) {
NEXT_ARG();
pm.selector = IEEE_8021QAZ_APP_SEL_STREAM;
ret = parse_mapping(&argc, &argv, false,
&dcb_app_parse_mapping_port_prio,
&pm);
} else if (matches(*argv, "dgram-port-prio") == 0) {
NEXT_ARG();
pm.selector = IEEE_8021QAZ_APP_SEL_DGRAM;
ret = parse_mapping(&argc, &argv, false,
&dcb_app_parse_mapping_port_prio,
&pm);
} else if (matches(*argv, "port-prio") == 0) {
NEXT_ARG();
pm.selector = IEEE_8021QAZ_APP_SEL_ANY;
ret = parse_mapping(&argc, &argv, false,
&dcb_app_parse_mapping_port_prio,
&pm);
} else {
fprintf(stderr, "What is \"%s\"?\n", *argv);
dcb_app_help_add();
return -EINVAL;
}
if (ret != 0) {
fprintf(stderr, "Invalid mapping %s\n", *argv);
return ret;
}
/* Appending to the table happens inside a void callback, so its failure
 * is reported through pm.err rather than through ret.
 */
if (pm.err)
return pm.err;
/* The parsers receive &argc / &argv; presumably they advance both past the
 * arguments they consumed, which is what ends this loop - verify.
 */
} while (argc > 0);
return 0;
}
/* "dcb app add dev DEV ...": parse the requested mappings and add them to
 * DEV with DCB_CMD_IEEE_SET. Returns 0 on success or the first error from
 * parsing or from the netlink request.
 */
static int dcb_cmd_app_add(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct dcb_app_table tab = {};
	int ret;

	ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab);
	if (ret == 0)
		ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_SET, &tab, NULL);

	/* Free on every path: a failed parse may already have appended
	 * entries to the table.
	 */
	dcb_app_table_fini(&tab);
	return ret;
}
/* "dcb app del dev DEV ...": parse the requested mappings and delete them
 * from DEV with DCB_CMD_IEEE_DEL. Returns 0 on success or the first error
 * from parsing or from the netlink request.
 */
static int dcb_cmd_app_del(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct dcb_app_table tab = {};
	int ret;

	ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab);
	if (ret == 0)
		ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab, NULL);

	/* Free on every path: a failed parse may already have appended
	 * entries to the table.
	 */
	dcb_app_table_fini(&tab);
	return ret;
}
/* "dcb app show dev DEV [ GROUP... ]": read the APP table of DEV, sort it and
 * print either every group (no arguments) or just the groups named.
 *
 * Returns 0 on success (including "help"), the error from reading the table,
 * or -EINVAL for an unknown group name.
 */
static int dcb_cmd_app_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct dcb_app_table tab = {};
	int ret;

	ret = dcb_app_get(dcb, dev, &tab);
	if (ret != 0)
		goto out_free;	/* the table may be partially filled */

	dcb_app_table_sort(&tab);

	open_json_object(NULL);

	if (!argc) {
		dcb_app_print(dcb, &tab);
		goto out;
	}

	do {
		if (matches(*argv, "help") == 0) {
			dcb_app_help_show_flush();
			goto out;
		} else if (matches(*argv, "ethtype-prio") == 0) {
			dcb_app_print_ethtype_prio(&tab);
		} else if (matches(*argv, "default-prio") == 0) {
			/* Listed in the usage text, so it has to be accepted
			 * here as well.
			 */
			dcb_app_print_default_prio(&tab);
		} else if (matches(*argv, "dscp-prio") == 0) {
			dcb_app_print_dscp_prio(dcb, &tab);
		} else if (matches(*argv, "stream-port-prio") == 0) {
			dcb_app_print_stream_port_prio(&tab);
		} else if (matches(*argv, "dgram-port-prio") == 0) {
			dcb_app_print_dgram_port_prio(&tab);
		} else if (matches(*argv, "port-prio") == 0) {
			dcb_app_print_port_prio(&tab);
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_app_help_show_flush();
			ret = -EINVAL;
			goto out;
		}

		NEXT_ARG_FWD();
	} while (argc > 0);

out:
	close_json_object();
out_free:
	dcb_app_table_fini(&tab);
	return ret;
}
static int dcb_cmd_app_flush(struct dcb *dcb, const char *dev, int argc, char **argv)
{
struct dcb_app_table tab = {};
int ret;
ret = dcb_app_get(dcb, dev, &tab);
if (ret != 0)
return ret;
if (!argc) {
ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab, NULL);
goto out;
}
do {
if (matches(*argv, "help") == 0) {
dcb_app_help_show_flush();
goto out;
} else if (matches(*argv, "ethtype-prio") == 0) {
ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
&dcb_app_is_ethtype);
if (ret != 0)
goto out;
} else if (matches(*argv, "default-prio") == 0) {
ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
&dcb_app_is_default);
if (ret != 0)
goto out;
} else if (matches(*argv, "dscp-prio") == 0) {
ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &tab,
&dcb_app_is_dscp);
if (ret != 0)
goto out;
} else {
fprintf(stderr, "What is \"%s\"?\n", *argv);
dcb_app_help_show_flush();
ret = -EINVAL;
goto out;
}
NEXT_ARG_FWD();
} while (argc > 0);
out:
dcb_app_table_fini(&tab);
return ret;
}
/* "dcb app replace dev DEV ...": make the requested mappings the only ones
 * for their selector/protocol keys.
 *
 * The new entries that are not already present are added first; then the
 * existing entries whose key was mentioned but whose priority differs are
 * deleted. Returns 0 on success or the first error encountered.
 */
static int dcb_cmd_app_replace(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct dcb_app_table orig = {};
	struct dcb_app_table tab = {};
	struct dcb_app_table new = {};
	int ret;

	ret = dcb_app_get(dcb, dev, &orig);
	if (ret != 0)
		goto out;	/* orig may be partially filled */

	ret = dcb_cmd_app_parse_add_del(dcb, dev, argc, argv, &tab);
	if (ret != 0)
		goto out;

	/* Attempts to add an existing entry would be rejected, so drop
	 * these entries from the copy that gets added.
	 */
	ret = dcb_app_table_copy(&new, &tab);
	if (ret != 0)
		goto out;
	dcb_app_table_remove_existing(&new, &orig);

	ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_SET, &new, NULL);
	if (ret != 0) {
		fprintf(stderr, "Could not add new APP entries\n");
		goto out;
	}

	/* Remove the obsolete entries. */
	dcb_app_table_remove_replaced(&orig, &tab);
	ret = dcb_app_add_del(dcb, dev, DCB_CMD_IEEE_DEL, &orig, NULL);
	if (ret != 0)
		fprintf(stderr, "Could not remove replaced APP entries\n");

out:
	dcb_app_table_fini(&new);
	dcb_app_table_fini(&tab);
	dcb_app_table_fini(&orig);
	/* Propagate the failure: returning 0 unconditionally would make
	 * every error above invisible to the caller and the exit status.
	 */
	return ret;
}
/* Entry point of "dcb app": dispatch on the sub-command word.
 *
 * No arguments or "help" prints the full app usage and returns 0. A known
 * sub-command is handed, together with its usage printer, to
 * dcb_cmd_parse_dev() along with the remaining arguments. Anything else
 * prints a diagnostic plus the usage and returns -EINVAL.
 */
int dcb_cmd_app(struct dcb *dcb, int argc, char **argv)
{
	/* Order matters: the first matches() hit wins. */
	static const struct {
		const char *verb;
		int (*run)(struct dcb *dcb, const char *dev,
			   int argc, char **argv);
		void (*usage)(void);
	} verbs[] = {
		{ "show",	dcb_cmd_app_show,	dcb_app_help_show_flush },
		{ "flush",	dcb_cmd_app_flush,	dcb_app_help_show_flush },
		{ "add",	dcb_cmd_app_add,	dcb_app_help_add },
		{ "del",	dcb_cmd_app_del,	dcb_app_help_add },
		{ "replace",	dcb_cmd_app_replace,	dcb_app_help_add },
	};
	size_t i;

	if (!argc || matches(*argv, "help") == 0) {
		dcb_app_help();
		return 0;
	}

	for (i = 0; i < sizeof(verbs) / sizeof(verbs[0]); i++) {
		if (matches(*argv, verbs[i].verb) != 0)
			continue;

		return dcb_cmd_parse_dev(dcb, argc - 1, argv + 1,
					 verbs[i].run, verbs[i].usage);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_app_help();
	return -EINVAL;
}

View File

@ -1,235 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
/* Write the usage text for "dcb buffer set" to stderr. */
static void dcb_buffer_help_set(void)
{
	static const char usage[] =
		"Usage: dcb buffer set dev STRING\n"
		" [ prio-buffer PRIO-MAP ]\n"
		" [ buffer-size SIZE-MAP ]\n"
		"\n"
		" where PRIO-MAP := [ PRIO-MAP ] PRIO-MAPPING\n"
		" PRIO-MAPPING := { all | PRIO }:BUFFER\n"
		" SIZE-MAP := [ SIZE-MAP ] SIZE-MAPPING\n"
		" SIZE-MAPPING := { all | BUFFER }:INTEGER\n"
		" PRIO := { 0 .. 7 }\n"
		" BUFFER := { 0 .. 7 }\n"
		"\n";

	fputs(usage, stderr);
}
/* Write the usage text for "dcb buffer show" to stderr. */
static void dcb_buffer_help_show(void)
{
	static const char usage[] =
		"Usage: dcb buffer show dev STRING\n"
		" [ prio-buffer ] [ buffer-size ] [ total-size ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Write the complete "dcb buffer" usage to stderr: the help line, then the
 * show usage, then the set usage.
 */
static void dcb_buffer_help(void)
{
	fputs("Usage: dcb buffer help\n"
	      "\n",
	      stderr);

	dcb_buffer_help_show();
	dcb_buffer_help_set();
}
/* Mapping callback for "prio-buffer PRIO:BUFFER".
 *
 * DATA is the struct dcbnl_buffer being edited. A buffer number that does not
 * parse as a u8 yields -EINVAL; otherwise dcb_parse_mapping() validates the
 * pair (priority up to IEEE_8021Q_MAX_PRIORITIES - 1, buffer up to
 * DCBX_MAX_BUFFERS - 1) and stores it in the prio2buffer array.
 */
static int dcb_buffer_parse_mapping_prio_buffer(__u32 key, char *value, void *data)
{
	struct dcbnl_buffer *cfg = data;
	__u8 buffer_no;

	if (get_u8(&buffer_no, value, 0) != 0)
		return -EINVAL;

	return dcb_parse_mapping("PRIO", key, IEEE_8021Q_MAX_PRIORITIES - 1,
				 "BUFFER", buffer_no, DCBX_MAX_BUFFERS - 1,
				 dcb_set_u8, cfg->prio2buffer);
}
/* Mapping callback for "buffer-size BUFFER:SIZE".
 *
 * DATA is the struct dcbnl_buffer being edited. A size rejected by get_size()
 * is reported and yields -EINVAL; otherwise dcb_parse_mapping() validates the
 * buffer number (up to DCBX_MAX_BUFFERS - 1; the size limit is passed as -1)
 * and stores the size in the buffer_size array.
 */
static int dcb_buffer_parse_mapping_buffer_size(__u32 key, char *value, void *data)
{
	struct dcbnl_buffer *cfg = data;
	unsigned int bytes;

	if (get_size(&bytes, value) != 0) {
		fprintf(stderr, "%d:%s: Illegal value for buffer size\n", key, value);
		return -EINVAL;
	}

	return dcb_parse_mapping("BUFFER", key, DCBX_MAX_BUFFERS - 1,
				 "INTEGER", bytes, -1,
				 dcb_set_u32, cfg->buffer_size);
}
/* Print the total_size field of BUFFER as "total-size SIZE " (JSON key
 * "total_size").
 */
static void dcb_buffer_print_total_size(const struct dcbnl_buffer *buffer)
{
	print_size(PRINT_ANY,
		   "total_size",
		   "total-size %s ",
		   buffer->total_size);
}
/* Print the prio2buffer array of BUFFER as a named array of "PRIO:BUFFER "
 * pairs (JSON name "prio_buffer", plain-text label "prio-buffer").
 */
static void dcb_buffer_print_prio_buffer(const struct dcbnl_buffer *buffer)
{
	const size_t count = ARRAY_SIZE(buffer->prio2buffer);

	dcb_print_named_array("prio_buffer", "prio-buffer",
			      buffer->prio2buffer, count,
			      dcb_print_array_u8);
}
static void dcb_buffer_print_buffer_size(const struct dcbnl_buffer *buffer)
{
size_t size = ARRAY_SIZE(buffer->buffer_size);
SPRINT_BUF(b);
size_t i;
open_json_array(PRINT_JSON, "buffer_size");
print_string(PRINT_FP, NULL, "buffer-size ", NULL);
for (i = 0; i < size; i++) {
snprintf(b, sizeof(b), "%zd:%%s ", i);
print_size(PRINT_ANY, NULL, b, buffer->buffer_size[i]);
}
close_json_array(PRINT_JSON, "buffer_size");
}
/* Print the whole buffer configuration: prio-buffer, buffer-size and
 * total-size, in that order, each followed by print_nl().
 */
static void dcb_buffer_print(const struct dcbnl_buffer *buffer)
{
dcb_buffer_print_prio_buffer(buffer);
print_nl();
dcb_buffer_print_buffer_size(buffer);
print_nl();
dcb_buffer_print_total_size(buffer);
print_nl();
}
/* Read the DCB_ATTR_DCB_BUFFER attribute of DEV into *BUFFER. Returns the
 * dcb_get_attribute() result.
 */
static int dcb_buffer_get(struct dcb *dcb, const char *dev, struct dcbnl_buffer *buffer)
{
	const size_t len = sizeof(struct dcbnl_buffer);

	return dcb_get_attribute(dcb, dev, DCB_ATTR_DCB_BUFFER, buffer, len);
}

/* Write *BUFFER to the DCB_ATTR_DCB_BUFFER attribute of DEV. Returns the
 * dcb_set_attribute() result.
 */
static int dcb_buffer_set(struct dcb *dcb, const char *dev, const struct dcbnl_buffer *buffer)
{
	const size_t len = sizeof(struct dcbnl_buffer);

	return dcb_set_attribute(dcb, dev, DCB_ATTR_DCB_BUFFER, buffer, len);
}
/* "dcb buffer set dev DEV ...": read the current buffer configuration, apply
 * the prio-buffer / buffer-size mappings given on the command line to it and
 * write the result back.
 *
 * Returns 0 after printing help (no arguments, or a "help" keyword), -EINVAL
 * for an unknown keyword, the error from reading or from a mapping parser, or
 * otherwise the result of writing the configuration.
 */
static int dcb_cmd_buffer_set(struct dcb *dcb, const char *dev, int argc, char **argv)
{
struct dcbnl_buffer buffer;
int ret;
if (!argc) {
dcb_buffer_help_set();
return 0;
}
/* Start from the device's current settings so that anything not mentioned on
 * the command line is written back unchanged.
 */
ret = dcb_buffer_get(dcb, dev, &buffer);
if (ret)
return ret;
do {
if (matches(*argv, "help") == 0) {
dcb_buffer_help_set();
return 0;
} else if (matches(*argv, "prio-buffer") == 0) {
NEXT_ARG();
ret = parse_mapping(&argc, &argv, true,
&dcb_buffer_parse_mapping_prio_buffer, &buffer);
if (ret) {
fprintf(stderr, "Invalid priority mapping %s\n", *argv);
return ret;
}
/* parse_mapping() is given &argc / &argv; presumably it has already
 * advanced them, hence the jump straight to the loop condition - verify.
 */
continue;
} else if (matches(*argv, "buffer-size") == 0) {
NEXT_ARG();
ret = parse_mapping(&argc, &argv, true,
&dcb_buffer_parse_mapping_buffer_size, &buffer);
if (ret) {
fprintf(stderr, "Invalid buffer size mapping %s\n", *argv);
return ret;
}
continue;
} else {
fprintf(stderr, "What is \"%s\"?\n", *argv);
dcb_buffer_help_set();
return -EINVAL;
}
/* Not reached: every branch above ends in return or continue. */
NEXT_ARG_FWD();
} while (argc > 0);
return dcb_buffer_set(dcb, dev, &buffer);
}
/* "dcb buffer show dev DEV [ prio-buffer ] [ buffer-size ] [ total-size ]".
 *
 * With no selector, everything is printed. Returns 0 on success (including
 * "help"), -EINVAL on an unknown selector, or the error from fetching the
 * attribute.
 */
static int dcb_cmd_buffer_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct dcbnl_buffer buffer;
	int ret;

	ret = dcb_buffer_get(dcb, dev, &buffer);
	if (ret)
		return ret;

	open_json_object(NULL);

	if (!argc) {
		dcb_buffer_print(&buffer);
		goto out;
	}

	do {
		if (matches(*argv, "help") == 0) {
			dcb_buffer_help_show();
			goto out;
		} else if (matches(*argv, "prio-buffer") == 0) {
			dcb_buffer_print_prio_buffer(&buffer);
			print_nl();
		} else if (matches(*argv, "buffer-size") == 0) {
			dcb_buffer_print_buffer_size(&buffer);
			print_nl();
		} else if (matches(*argv, "total-size") == 0) {
			dcb_buffer_print_total_size(&buffer);
			print_nl();
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_buffer_help_show();
			ret = -EINVAL;
			goto out;
		}

		NEXT_ARG_FWD();
	} while (argc > 0);

out:
	/* Every exit taken after open_json_object() passes through here so
	 * the object is always closed, also on the help and error paths.
	 */
	close_json_object();
	return ret;
}
/* Entry point of "dcb buffer": dispatch to the help, show or set handler. */
int dcb_cmd_buffer(struct dcb *dcb, int argc, char **argv)
{
	if (!argc || matches(*argv, "help") == 0) {
		dcb_buffer_help();
		return 0;
	}

	if (matches(*argv, "show") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_buffer_show, dcb_buffer_help_show);
	}

	if (matches(*argv, "set") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_buffer_set, dcb_buffer_help_set);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_buffer_help();
	return -EINVAL;
}

View File

@ -1,192 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
/* Print the usage text of "dcb dcbx set" to stderr. */
static void dcb_dcbx_help_set(void)
{
	static const char usage[] =
		"Usage: dcb dcbx set dev STRING\n"
		" [ host | lld-managed ]\n"
		" [ cee | ieee ] [ static ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the usage text of "dcb dcbx show" to stderr. */
static void dcb_dcbx_help_show(void)
{
	static const char usage[] =
		"Usage: dcb dcbx show dev STRING\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the complete "dcb dcbx" usage: own synopsis, then show and set. */
static void dcb_dcbx_help(void)
{
	fputs("Usage: dcb dcbx help\n"
	      "\n",
	      stderr);

	dcb_dcbx_help_show();
	dcb_dcbx_help_set();
}
/* One DCBX capability bit together with the names it is shown under. */
struct dcb_dcbx_flag {
	__u8 value;		/* bit mask compared against / OR-ed into the DCBX byte */
	const char *key_fp;	/* command-line keyword and plain-text name */
	const char *key_json;	/* JSON key; when NULL, key_fp is used instead */
};
/* Known DCBX flags. Entries without .key_json reuse .key_fp as the JSON key
 * (see the "?:" fallback where the table is printed).
 */
static struct dcb_dcbx_flag dcb_dcbx_flags[] = {
	{ .value = DCB_CAP_DCBX_HOST,        .key_fp = "host" },
	{ .value = DCB_CAP_DCBX_LLD_MANAGED, .key_fp = "lld-managed",
	  .key_json = "lld_managed" },
	{ .value = DCB_CAP_DCBX_VER_CEE,     .key_fp = "cee" },
	{ .value = DCB_CAP_DCBX_VER_IEEE,    .key_fp = "ieee" },
	{ .value = DCB_CAP_DCBX_STATIC,      .key_fp = "static" },
};
static void dcb_dcbx_print(__u8 dcbx)
{
int bit;
int i;
while ((bit = ffs(dcbx))) {
bool found = false;
bit--;
for (i = 0; i < ARRAY_SIZE(dcb_dcbx_flags); i++) {
struct dcb_dcbx_flag *flag = &dcb_dcbx_flags[i];
if (flag->value == 1 << bit) {
print_bool(PRINT_JSON, flag->key_json ?: flag->key_fp,
NULL, true);
print_string(PRINT_FP, NULL, "%s ", flag->key_fp);
found = true;
break;
}
}
if (!found)
fprintf(stderr, "Unknown DCBX bit %#x.\n", 1 << bit);
dcbx &= ~(1 << bit);
}
print_nl();
}
/* Fetch the DCBX byte of @dev into *@dcbx.
 *
 * Returns 0 on success, the dcb_get_attribute_bare() error, or -EINVAL when
 * the returned payload is not exactly one byte long.
 */
static int dcb_dcbx_get(struct dcb *dcb, const char *dev, __u8 *dcbx)
{
	__u16 len;
	void *data;
	int rc;

	rc = dcb_get_attribute_bare(dcb, DCB_CMD_IEEE_GET, dev, DCB_ATTR_DCBX,
				    &data, &len);
	if (rc != 0)
		return rc;

	if (len != 1) {
		fprintf(stderr, "DCB_ATTR_DCBX payload has size %d, expected 1.\n",
			len);
		return -EINVAL;
	}

	*dcbx = *(__u8 *) data;
	return 0;
}
/* Send @dcbx to @dev as a one-byte DCB_ATTR_DCBX payload of a DCB_CMD_SDCBX
 * request. Returns whatever dcb_set_attribute_bare() returns.
 */
static int dcb_dcbx_set(struct dcb *dcb, const char *dev, __u8 dcbx)
{
	const size_t payload_len = 1;

	return dcb_set_attribute_bare(dcb, DCB_CMD_SDCBX, dev, DCB_ATTR_DCBX,
				      &dcbx, payload_len, DCB_ATTR_DCBX);
}
/* "dcb dcbx set dev DEV FLAG...": OR together the flags named on the command
 * line and send the resulting byte to the device.
 *
 * Returns 0 after printing help, -EINVAL for an unknown keyword, otherwise
 * the result of dcb_dcbx_set().
 */
static int dcb_cmd_dcbx_set(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	__u8 dcbx = 0;

	if (!argc) {
		dcb_dcbx_help_set();
		return 0;
	}

	do {
		bool recognized = false;
		__u8 idx;

		if (matches(*argv, "help") == 0) {
			dcb_dcbx_help_set();
			return 0;
		}

		for (idx = 0; idx < ARRAY_SIZE(dcb_dcbx_flags); idx++) {
			if (matches(*argv, dcb_dcbx_flags[idx].key_fp) == 0) {
				dcbx |= dcb_dcbx_flags[idx].value;
				recognized = true;
				break;
			}
		}

		if (!recognized) {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_dcbx_help_set();
			return -EINVAL;
		}

		NEXT_ARG_FWD();
	} while (argc > 0);

	return dcb_dcbx_set(dcb, dev, dcbx);
}
/* "dcb dcbx show dev DEV": print the DCBX flags of the device.
 *
 * The only accepted extra argument is "help". Returns 0 on success or after
 * help, -EINVAL for any other argument, or the error from dcb_dcbx_get().
 */
static int dcb_cmd_dcbx_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	__u8 dcbx;
	int ret;

	ret = dcb_dcbx_get(dcb, dev, &dcbx);
	if (ret != 0)
		return ret;

	/* Any argument ends the command (help or error), so only the first
	 * one is ever looked at; a loop with an argument advance would be
	 * dead code.
	 */
	if (argc > 0) {
		if (matches(*argv, "help") == 0) {
			dcb_dcbx_help_show();
			return 0;
		}

		fprintf(stderr, "What is \"%s\"?\n", *argv);
		dcb_dcbx_help_show();
		return -EINVAL;
	}

	open_json_object(NULL);
	dcb_dcbx_print(dcbx);
	close_json_object();
	return 0;
}
/* Entry point of "dcb dcbx": dispatch to the help, show or set handler. */
int dcb_cmd_dcbx(struct dcb *dcb, int argc, char **argv)
{
	if (!argc || matches(*argv, "help") == 0) {
		dcb_dcbx_help();
		return 0;
	}

	if (matches(*argv, "show") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_dcbx_show, dcb_dcbx_help_show);
	}

	if (matches(*argv, "set") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_dcbx_set, dcb_dcbx_help_set);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_dcbx_help();
	return -EINVAL;
}

View File

@ -1,435 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
/* Print the usage text of "dcb ets set" to stderr. */
static void dcb_ets_help_set(void)
{
	static const char usage[] =
		"Usage: dcb ets set dev STRING\n"
		" [ willing { on | off } ]\n"
		" [ { tc-tsa | reco-tc-tsa } TSA-MAP ]\n"
		" [ { pg-bw | tc-bw | reco-tc-bw } BW-MAP ]\n"
		" [ { prio-tc | reco-prio-tc } PRIO-MAP ]\n"
		"\n"
		" where TSA-MAP := [ TSA-MAP ] TSA-MAPPING\n"
		" TSA-MAPPING := { all | TC }:{ strict | cbs | ets | vendor }\n"
		" BW-MAP := [ BW-MAP ] BW-MAPPING\n"
		" BW-MAPPING := { all | TC }:INTEGER\n"
		" PRIO-MAP := [ PRIO-MAP ] PRIO-MAPPING\n"
		" PRIO-MAPPING := { all | PRIO }:TC\n"
		" TC := { 0 .. 7 }\n"
		" PRIO := { 0 .. 7 }\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the usage text of "dcb ets show" to stderr. */
static void dcb_ets_help_show(void)
{
	static const char usage[] =
		"Usage: dcb ets show dev STRING\n"
		" [ willing ] [ ets-cap ] [ cbs ] [ tc-tsa ]\n"
		" [ reco-tc-tsa ] [ pg-bw ] [ tc-bw ] [ reco-tc-bw ]\n"
		" [ prio-tc ] [ reco-prio-tc ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the complete "dcb ets" usage: own synopsis, then show and set. */
static void dcb_ets_help(void)
{
	fputs("Usage: dcb ets help\n"
	      "\n",
	      stderr);

	dcb_ets_help_show();
	dcb_ets_help_set();
}
/* Keyword for each transmission selection algorithm, indexed by the
 * IEEE_8021QAZ_TSA_* value. Indices without an entry are NULL.
 */
static const char *const tsa_names[] = {
	[IEEE_8021QAZ_TSA_STRICT]    = "strict",
	[IEEE_8021QAZ_TSA_CB_SHAPER] = "cbs",
	[IEEE_8021QAZ_TSA_ETS]       = "ets",
	[IEEE_8021QAZ_TSA_VENDOR]    = "vendor",
};
/* parse_mapping() callback for "TC:TSA" elements: translate the TSA keyword
 * in @value through tsa_names and store it for traffic class @key in the
 * array passed as @data.
 */
static int dcb_ets_parse_mapping_tc_tsa(__u32 key, char *value, void *data)
{
	int err;
	__u8 tsa = parse_one_of("TSA", value, tsa_names, ARRAY_SIZE(tsa_names), &err);

	if (err)
		return err;

	return dcb_parse_mapping("TC", key, IEEE_8021QAZ_MAX_TCS - 1,
				 "TSA", tsa, -1U,
				 dcb_set_u8, data);
}
/* parse_mapping() callback for "TC:BW" elements: @value must parse as a u8
 * and is accepted up to 100; it is stored for traffic class @key in the
 * array passed as @data.
 */
static int dcb_ets_parse_mapping_tc_bw(__u32 key, char *value, void *data)
{
	__u8 percent;

	if (get_u8(&percent, value, 0) != 0)
		return -EINVAL;

	return dcb_parse_mapping("TC", key, IEEE_8021QAZ_MAX_TCS - 1,
				 "BW", percent, 100,
				 dcb_set_u8, data);
}
/* parse_mapping() callback for "PRIO:TC" elements: @value must parse as a u8
 * no larger than IEEE_8021QAZ_MAX_TCS - 1; it is stored for priority @key in
 * the array passed as @data.
 *
 * The key is declared __u32 like in the sibling callbacks
 * (dcb_ets_parse_mapping_tc_tsa, dcb_ets_parse_mapping_tc_bw) so all three
 * share one spelling of the callback signature.
 */
static int dcb_ets_parse_mapping_prio_tc(__u32 key, char *value, void *data)
{
	__u8 tc;

	if (get_u8(&tc, value, 0))
		return -EINVAL;

	return dcb_parse_mapping("PRIO", key, IEEE_8021QAZ_MAX_TCS - 1,
				 "TC", tc, IEEE_8021QAZ_MAX_TCS - 1,
				 dcb_set_u8, data);
}
static void dcb_print_array_tsa(const __u8 *array, size_t size)
{
dcb_print_array_kw(array, size, tsa_names, ARRAY_SIZE(tsa_names));
}
/* Print the "willing" flag of @ets as on/off. */
static void dcb_ets_print_willing(const struct ieee_ets *ets)
{
	print_on_off(PRINT_ANY,
		     "willing", "willing %s ",
		     ets->willing);
}
/* Print the ets_cap field of @ets.
 *
 * The value goes through print_uint(), so the plain-text format uses %u
 * rather than the signed %d.
 */
static void dcb_ets_print_ets_cap(const struct ieee_ets *ets)
{
	print_uint(PRINT_ANY, "ets_cap", "ets-cap %u ", ets->ets_cap);
}
/* Print the "cbs" flag of @ets as on/off. */
static void dcb_ets_print_cbs(const struct ieee_ets *ets)
{
	print_on_off(PRINT_ANY,
		     "cbs", "cbs %s ",
		     ets->cbs);
}
/* Print ets->tc_tx_bw as the "tc-bw" array (JSON key "tc_bw"). */
static void dcb_ets_print_tc_bw(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->tc_tx_bw);

	dcb_print_named_array("tc_bw", "tc-bw", ets->tc_tx_bw, count,
			      dcb_print_array_u8);
}
/* Print ets->tc_rx_bw as the "pg-bw" array (JSON key "pg_bw"). */
static void dcb_ets_print_pg_bw(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->tc_rx_bw);

	dcb_print_named_array("pg_bw", "pg-bw", ets->tc_rx_bw, count,
			      dcb_print_array_u8);
}
/* Print ets->tc_tsa as the "tc-tsa" array (JSON key "tc_tsa"), using TSA
 * keywords for the elements.
 */
static void dcb_ets_print_tc_tsa(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->tc_tsa);

	dcb_print_named_array("tc_tsa", "tc-tsa", ets->tc_tsa, count,
			      dcb_print_array_tsa);
}
/* Print ets->prio_tc as the "prio-tc" array (JSON key "prio_tc"). */
static void dcb_ets_print_prio_tc(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->prio_tc);

	dcb_print_named_array("prio_tc", "prio-tc", ets->prio_tc, count,
			      dcb_print_array_u8);
}
/* Print ets->tc_reco_bw as the "reco-tc-bw" array (JSON key "reco_tc_bw"). */
static void dcb_ets_print_reco_tc_bw(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->tc_reco_bw);

	dcb_print_named_array("reco_tc_bw", "reco-tc-bw", ets->tc_reco_bw, count,
			      dcb_print_array_u8);
}
/* Print ets->tc_reco_tsa as the "reco-tc-tsa" array (JSON key
 * "reco_tc_tsa"), using TSA keywords for the elements.
 */
static void dcb_ets_print_reco_tc_tsa(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->tc_reco_tsa);

	dcb_print_named_array("reco_tc_tsa", "reco-tc-tsa", ets->tc_reco_tsa, count,
			      dcb_print_array_tsa);
}
/* Print ets->reco_prio_tc as the "reco-prio-tc" array (JSON key
 * "reco_prio_tc").
 */
static void dcb_ets_print_reco_prio_tc(const struct ieee_ets *ets)
{
	const size_t count = ARRAY_SIZE(ets->reco_prio_tc);

	dcb_print_named_array("reco_prio_tc", "reco-prio-tc", ets->reco_prio_tc, count,
			      dcb_print_array_u8);
}
static void dcb_ets_print(const struct ieee_ets *ets)
{
dcb_ets_print_willing(ets);
dcb_ets_print_ets_cap(ets);
dcb_ets_print_cbs(ets);
print_nl();
dcb_ets_print_tc_bw(ets);
print_nl();
dcb_ets_print_pg_bw(ets);
print_nl();
dcb_ets_print_tc_tsa(ets);
print_nl();
dcb_ets_print_prio_tc(ets);
print_nl();
dcb_ets_print_reco_tc_bw(ets);
print_nl();
dcb_ets_print_reco_tc_tsa(ets);
print_nl();
dcb_ets_print_reco_prio_tc(ets);
print_nl();
}
/* Read the DCB_ATTR_IEEE_ETS attribute of @dev into @ets.
 * Returns whatever dcb_get_attribute() returns.
 */
static int dcb_ets_get(struct dcb *dcb, const char *dev, struct ieee_ets *ets)
{
	const size_t len = sizeof(*ets);

	return dcb_get_attribute(dcb, dev, DCB_ATTR_IEEE_ETS, ets, len);
}
/* Validate one bandwidth table against its TSA table.
 *
 * @bw:   IEEE_8021QAZ_MAX_TCS bandwidth percentages
 * @tsa:  IEEE_8021QAZ_MAX_TCS transmission selection algorithms
 * @what: name of the table, used in error messages only
 *
 * Returns 0 when every entry is 0..100 and the entries sum to 100, or sum to
 * 0 while no TC uses ETS. Otherwise prints a diagnostic to stderr and returns
 * -EINVAL.
 */
static int dcb_ets_validate_bw(const __u8 bw[], const __u8 tsa[], const char *what)
{
	bool has_ets = false;
	unsigned int total = 0;
	unsigned int tc;

	for (tc = 0; tc < IEEE_8021QAZ_MAX_TCS; tc++) {
		if (tsa[tc] == IEEE_8021QAZ_TSA_ETS) {
			has_ets = true;
			break;
		}
	}

	/* TC bandwidth is only intended for ETS, but 802.1Q-2018 only requires
	 * that the sum be 100, and individual entries 0..100. It explicitly
	 * notes that non-ETS TCs can have non-0 TC bandwidth during
	 * reconfiguration.
	 */
	for (tc = 0; tc < IEEE_8021QAZ_MAX_TCS; tc++) {
		if (bw[tc] > 100) {
			/* tc is unsigned, hence %u; bw[tc] promotes to int. */
			fprintf(stderr, "%d%% for TC %u of %s is not a valid bandwidth percentage, expected 0..100%%\n",
				bw[tc], tc, what);
			return -EINVAL;
		}
		total += bw[tc];
	}

	/* This is what 802.1Q-2018 requires. */
	if (total == 100)
		return 0;

	/* But this requirement does not make sense for all-strict
	 * configurations. Anything else than 0 does not make sense: either BW
	 * has not been reconfigured for the all-strict allocation yet, at which
	 * point we expect sum of 100. Or it has already been reconfigured, at
	 * which point accept 0.
	 */
	if (!has_ets && total == 0)
		return 0;

	fprintf(stderr, "Bandwidth percentages in %s sum to %u%%, expected %d%%\n",
		what, total, has_ets ? 100 : 0);
	return -EINVAL;
}
/* Validate the tc-bw and reco-tc-bw tables of @ets and, when both pass,
 * write @ets to @dev as DCB_ATTR_IEEE_ETS. Returns -EINVAL on a validation
 * failure, otherwise whatever dcb_set_attribute() returns.
 */
static int dcb_ets_set(struct dcb *dcb, const char *dev, const struct ieee_ets *ets)
{
	/* Do not validate pg-bw, which is not standard and has unclear
	 * meaning.
	 */
	if (dcb_ets_validate_bw(ets->tc_tx_bw, ets->tc_tsa, "tc-bw") != 0)
		return -EINVAL;

	if (dcb_ets_validate_bw(ets->tc_reco_bw, ets->tc_reco_tsa, "reco-tc-bw") != 0)
		return -EINVAL;

	return dcb_set_attribute(dcb, dev, DCB_ATTR_IEEE_ETS, ets, sizeof(*ets));
}
/* "dcb ets set dev DEV ...": fetch the current ETS configuration, apply the
 * "willing" flag and the mapping keywords given on the command line, and
 * write the result back through dcb_ets_set().
 *
 * Returns 0 after printing help (also when no arguments are given, like the
 * other "set" commands in this tool), -EINVAL for an unknown keyword, or the
 * error of the failing step.
 */
static int dcb_cmd_ets_set(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct ieee_ets ets;
	/* Mapping keywords in match order. The order matters because
	 * matches() is tried entry by entry and the first hit wins.
	 */
	const struct {
		const char *name;
		int (*parse)(__u32 key, char *value, void *data);
		void *array;
	} maps[] = {
		{ "tc-tsa",       dcb_ets_parse_mapping_tc_tsa,  ets.tc_tsa },
		{ "reco-tc-tsa",  dcb_ets_parse_mapping_tc_tsa,  ets.tc_reco_tsa },
		{ "tc-bw",        dcb_ets_parse_mapping_tc_bw,   ets.tc_tx_bw },
		{ "pg-bw",        dcb_ets_parse_mapping_tc_bw,   ets.tc_rx_bw },
		{ "reco-tc-bw",   dcb_ets_parse_mapping_tc_bw,   ets.tc_reco_bw },
		{ "prio-tc",      dcb_ets_parse_mapping_prio_tc, ets.prio_tc },
		{ "reco-prio-tc", dcb_ets_parse_mapping_prio_tc, ets.reco_prio_tc },
	};
	int ret;

	if (!argc) {
		dcb_ets_help_set();
		return 0;
	}

	/* Start from the device's current state so that unmentioned fields
	 * are written back unchanged.
	 */
	ret = dcb_ets_get(dcb, dev, &ets);
	if (ret)
		return ret;

	do {
		size_t i;

		if (matches(*argv, "help") == 0) {
			dcb_ets_help_set();
			return 0;
		}

		if (matches(*argv, "willing") == 0) {
			NEXT_ARG();
			ets.willing = parse_on_off("willing", *argv, &ret);
			if (ret)
				return ret;
			NEXT_ARG_FWD();
			continue;
		}

		for (i = 0; i < ARRAY_SIZE(maps); i++) {
			if (matches(*argv, maps[i].name) == 0)
				break;
		}

		if (i == ARRAY_SIZE(maps)) {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_ets_help_set();
			return -EINVAL;
		}

		NEXT_ARG();
		ret = parse_mapping(&argc, &argv, true, maps[i].parse, maps[i].array);
		if (ret) {
			fprintf(stderr, "Invalid %s mapping %s\n", maps[i].name, *argv);
			return ret;
		}

		/* No explicit advance here: parse_mapping() is handed
		 * &argc/&argv and moves them itself.
		 */
	} while (argc > 0);

	return dcb_ets_set(dcb, dev, &ets);
}
static int dcb_cmd_ets_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
struct ieee_ets ets;
int ret;
ret = dcb_ets_get(dcb, dev, &ets);
if (ret)
return ret;
open_json_object(NULL);
if (!argc) {
dcb_ets_print(&ets);
goto out;
}
do {
if (matches(*argv, "help") == 0) {
dcb_ets_help_show();
return 0;
} else if (matches(*argv, "willing") == 0) {
dcb_ets_print_willing(&ets);
print_nl();
} else if (matches(*argv, "ets-cap") == 0) {
dcb_ets_print_ets_cap(&ets);
print_nl();
} else if (matches(*argv, "cbs") == 0) {
dcb_ets_print_cbs(&ets);
print_nl();
} else if (matches(*argv, "tc-tsa") == 0) {
dcb_ets_print_tc_tsa(&ets);
print_nl();
} else if (matches(*argv, "reco-tc-tsa") == 0) {
dcb_ets_print_reco_tc_tsa(&ets);
print_nl();
} else if (matches(*argv, "tc-bw") == 0) {
dcb_ets_print_tc_bw(&ets);
print_nl();
} else if (matches(*argv, "pg-bw") == 0) {
dcb_ets_print_pg_bw(&ets);
print_nl();
} else if (matches(*argv, "reco-tc-bw") == 0) {
dcb_ets_print_reco_tc_bw(&ets);
print_nl();
} else if (matches(*argv, "prio-tc") == 0) {
dcb_ets_print_prio_tc(&ets);
print_nl();
} else if (matches(*argv, "reco-prio-tc") == 0) {
dcb_ets_print_reco_prio_tc(&ets);
print_nl();
} else {
fprintf(stderr, "What is \"%s\"?\n", *argv);
dcb_ets_help_show();
return -EINVAL;
}
NEXT_ARG_FWD();
} while (argc > 0);
out:
close_json_object();
return 0;
}
/* Entry point of "dcb ets": dispatch to the help, show or set handler. */
int dcb_cmd_ets(struct dcb *dcb, int argc, char **argv)
{
	if (!argc || matches(*argv, "help") == 0) {
		dcb_ets_help();
		return 0;
	}

	if (matches(*argv, "show") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_ets_show, dcb_ets_help_show);
	}

	if (matches(*argv, "set") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_ets_set, dcb_ets_help_set);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_ets_help();
	return -EINVAL;
}

View File

@ -1,182 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
/* Print the usage text of "dcb maxrate set" to stderr. */
static void dcb_maxrate_help_set(void)
{
	static const char usage[] =
		"Usage: dcb maxrate set dev STRING\n"
		" [ tc-maxrate RATE-MAP ]\n"
		"\n"
		" where RATE-MAP := [ RATE-MAP ] RATE-MAPPING\n"
		" RATE-MAPPING := { all | TC }:RATE\n"
		" TC := { 0 .. 7 }\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the usage text of "dcb maxrate show" to stderr. */
static void dcb_maxrate_help_show(void)
{
	static const char usage[] =
		"Usage: dcb [ -i ] maxrate show dev STRING\n"
		" [ tc-maxrate ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the complete "dcb maxrate" usage: own synopsis, then show and set. */
static void dcb_maxrate_help(void)
{
	fputs("Usage: dcb maxrate help\n"
	      "\n",
	      stderr);

	dcb_maxrate_help_show();
	dcb_maxrate_help_set();
}
/* parse_mapping() callback for "TC:RATE" elements: parse @value with
 * get_rate64() and store it for traffic class @key in the array passed as
 * @data.
 */
static int dcb_maxrate_parse_mapping_tc_maxrate(__u32 key, char *value, void *data)
{
	__u64 parsed_rate;

	if (get_rate64(&parsed_rate, value) != 0)
		return -EINVAL;

	return dcb_parse_mapping("TC", key, IEEE_8021QAZ_MAX_TCS - 1,
				 "RATE", parsed_rate, -1,
				 dcb_set_u64, data);
}
/* Print every entry of maxrate->tc_maxrate.
 *
 * Emits a JSON array named "tc_maxrate" in JSON mode, and the text
 * "tc-maxrate <index>:<rate> ..." otherwise. dcb->use_iec is forwarded to
 * print_rate().
 */
static void dcb_maxrate_print_tc_maxrate(struct dcb *dcb, const struct ieee_maxrate *maxrate)
{
	size_t size = ARRAY_SIZE(maxrate->tc_maxrate);
	SPRINT_BUF(b);
	size_t i;

	open_json_array(PRINT_JSON, "tc_maxrate");
	print_string(PRINT_FP, NULL, "tc-maxrate ", NULL);

	for (i = 0; i < size; i++) {
		/* Build the per-element format "<index>:%s ". The index is a
		 * size_t, so the matching conversion is %zu (not %zd, which
		 * expects the signed counterpart).
		 */
		snprintf(b, sizeof(b), "%zu:%%s ", i);
		print_rate(dcb->use_iec, PRINT_ANY, NULL, b, maxrate->tc_maxrate[i]);
	}

	close_json_array(PRINT_JSON, "tc_maxrate");
}
/* Full "show" output: the tc-maxrate array followed by print_nl(). */
static void dcb_maxrate_print(struct dcb *dcb, const struct ieee_maxrate *maxrate)
{
	dcb_maxrate_print_tc_maxrate(dcb, maxrate);

	print_nl();
}
/* Read the DCB_ATTR_IEEE_MAXRATE attribute of @dev into @maxrate.
 * Returns whatever dcb_get_attribute() returns.
 */
static int dcb_maxrate_get(struct dcb *dcb, const char *dev, struct ieee_maxrate *maxrate)
{
	const size_t len = sizeof(*maxrate);

	return dcb_get_attribute(dcb, dev, DCB_ATTR_IEEE_MAXRATE, maxrate, len);
}
/* Write @maxrate to @dev as the DCB_ATTR_IEEE_MAXRATE attribute.
 * Returns whatever dcb_set_attribute() returns.
 */
static int dcb_maxrate_set(struct dcb *dcb, const char *dev, const struct ieee_maxrate *maxrate)
{
	const size_t len = sizeof(*maxrate);

	return dcb_set_attribute(dcb, dev, DCB_ATTR_IEEE_MAXRATE, maxrate, len);
}
/* "dcb maxrate set dev DEV tc-maxrate RATE-MAP": fetch the current rates,
 * apply the mappings given on the command line and write the result back.
 *
 * Returns 0 after printing help, -EINVAL for an unknown keyword, or the
 * error of the failing step.
 */
static int dcb_cmd_maxrate_set(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct ieee_maxrate maxrate;
	int ret;

	if (!argc) {
		dcb_maxrate_help_set();
		return 0;
	}

	/* Start from the device's current state so that unmentioned entries
	 * are written back unchanged.
	 */
	ret = dcb_maxrate_get(dcb, dev, &maxrate);
	if (ret)
		return ret;

	do {
		if (matches(*argv, "help") == 0) {
			dcb_maxrate_help_set();
			return 0;
		}

		if (matches(*argv, "tc-maxrate") != 0) {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_maxrate_help_set();
			return -EINVAL;
		}

		NEXT_ARG();
		ret = parse_mapping(&argc, &argv, true,
				    &dcb_maxrate_parse_mapping_tc_maxrate, &maxrate);
		if (ret) {
			fprintf(stderr, "Invalid mapping %s\n", *argv);
			return ret;
		}

		/* No explicit advance here: parse_mapping() is handed
		 * &argc/&argv and moves them itself.
		 */
	} while (argc > 0);

	return dcb_maxrate_set(dcb, dev, &maxrate);
}
/* "dcb maxrate show dev DEV [ tc-maxrate ]".
 *
 * Returns 0 on success (including "help"), -EINVAL on an unknown selector, or
 * the error from fetching the attribute.
 */
static int dcb_cmd_maxrate_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct ieee_maxrate maxrate;
	int ret;

	ret = dcb_maxrate_get(dcb, dev, &maxrate);
	if (ret)
		return ret;

	open_json_object(NULL);

	if (!argc) {
		dcb_maxrate_print(dcb, &maxrate);
		goto out;
	}

	do {
		if (matches(*argv, "help") == 0) {
			dcb_maxrate_help_show();
			goto out;
		} else if (matches(*argv, "tc-maxrate") == 0) {
			dcb_maxrate_print_tc_maxrate(dcb, &maxrate);
			print_nl();
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_maxrate_help_show();
			ret = -EINVAL;
			goto out;
		}

		NEXT_ARG_FWD();
	} while (argc > 0);

out:
	/* Every exit taken after open_json_object() passes through here so
	 * the object is always closed, also on the help and error paths.
	 */
	close_json_object();
	return ret;
}
/* Entry point of "dcb maxrate": dispatch to the help, show or set handler. */
int dcb_cmd_maxrate(struct dcb *dcb, int argc, char **argv)
{
	if (!argc || matches(*argv, "help") == 0) {
		dcb_maxrate_help();
		return 0;
	}

	if (matches(*argv, "show") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_maxrate_show, dcb_maxrate_help_show);
	}

	if (matches(*argv, "set") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_maxrate_set, dcb_maxrate_help_set);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_maxrate_help();
	return -EINVAL;
}

View File

@ -1,286 +0,0 @@
// SPDX-License-Identifier: GPL-2.0+
#include <errno.h>
#include <stdio.h>
#include <linux/dcbnl.h>
#include "dcb.h"
#include "utils.h"
/* Print the usage text of "dcb pfc set" to stderr. */
static void dcb_pfc_help_set(void)
{
	static const char usage[] =
		"Usage: dcb pfc set dev STRING\n"
		" [ prio-pfc PFC-MAP ]\n"
		" [ macsec-bypass { on | off } ]\n"
		" [ delay INTEGER ]\n"
		"\n"
		" where PFC-MAP := [ PFC-MAP ] PFC-MAPPING\n"
		" PFC-MAPPING := { all | TC }:PFC\n"
		" TC := { 0 .. 7 }\n"
		" PFC := { on | off }\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the usage text of "dcb pfc show" to stderr. */
static void dcb_pfc_help_show(void)
{
	static const char usage[] =
		"Usage: dcb [ -s ] pfc show dev STRING\n"
		" [ pfc-cap ] [ prio-pfc ] [ macsec-bypass ]\n"
		" [ delay ] [ requests ] [ indications ]\n"
		"\n";

	fputs(usage, stderr);
}
/* Print the complete "dcb pfc" usage: own synopsis, then show and set. */
static void dcb_pfc_help(void)
{
	fputs("Usage: dcb pfc help\n"
	      "\n",
	      stderr);

	dcb_pfc_help_show();
	dcb_pfc_help_set();
}
/* Expand the bitmask @pfc_en into @array: element i becomes 1 when bit i of
 * the mask is set and 0 otherwise.
 */
static void dcb_pfc_to_array(__u8 array[IEEE_8021QAZ_MAX_TCS], __u8 pfc_en)
{
	int bit;

	for (bit = 0; bit < IEEE_8021QAZ_MAX_TCS; bit++)
		array[bit] = (pfc_en >> bit) & 1;
}
/* Collapse @array into a bitmask stored in *@pfc_en_p: bit i is set when
 * element i is non-zero.
 */
static void dcb_pfc_from_array(__u8 array[IEEE_8021QAZ_MAX_TCS], __u8 *pfc_en_p)
{
	__u8 mask = 0;
	int bit;

	for (bit = 0; bit < IEEE_8021QAZ_MAX_TCS; bit++)
		mask |= array[bit] ? 1 << bit : 0;

	*pfc_en_p = mask;
}
/* parse_mapping() callback for "PRIO:{on|off}" elements. @data is the
 * struct ieee_pfc being edited: its pfc_en bitmask is expanded to an array,
 * the element selected by @key is updated, and the array is packed back.
 * pfc->pfc_en is left untouched when parsing fails.
 */
static int dcb_pfc_parse_mapping_prio_pfc(__u32 key, char *value, void *data)
{
	struct ieee_pfc *pfc = data;
	__u8 per_prio[IEEE_8021QAZ_MAX_TCS];
	bool on;
	int err;

	dcb_pfc_to_array(per_prio, pfc->pfc_en);

	on = parse_on_off("PFC", value, &err);
	if (err)
		return err;

	err = dcb_parse_mapping("PRIO", key, IEEE_8021QAZ_MAX_TCS - 1,
				"PFC", on, -1,
				dcb_set_u8, per_prio);
	if (err)
		return err;

	dcb_pfc_from_array(per_prio, &pfc->pfc_en);
	return 0;
}
/* Print the pfc_cap field of @pfc.
 *
 * The value goes through print_uint(), so the plain-text format uses %u
 * rather than the signed %d.
 */
static void dcb_pfc_print_pfc_cap(const struct ieee_pfc *pfc)
{
	print_uint(PRINT_ANY, "pfc_cap", "pfc-cap %u ", pfc->pfc_cap);
}
/* Print pfc->mbc as the on/off "macsec-bypass" flag (JSON key
 * "macsec_bypass").
 */
static void dcb_pfc_print_macsec_bypass(const struct ieee_pfc *pfc)
{
	print_on_off(PRINT_ANY,
		     "macsec_bypass", "macsec-bypass %s ",
		     pfc->mbc);
}
/* Print the delay field of @pfc.
 *
 * The value goes through print_uint(), so the plain-text format uses %u
 * rather than the signed %d.
 */
static void dcb_pfc_print_delay(const struct ieee_pfc *pfc)
{
	print_uint(PRINT_ANY, "delay", "delay %u ", pfc->delay);
}
/* Print the pfc_en bitmask of @pfc as a per-priority on/off array named
 * "prio-pfc" (JSON key "prio_pfc").
 */
static void dcb_pfc_print_prio_pfc(const struct ieee_pfc *pfc)
{
	__u8 per_prio[IEEE_8021QAZ_MAX_TCS];
	const size_t count = ARRAY_SIZE(per_prio);

	dcb_pfc_to_array(per_prio, pfc->pfc_en);
	dcb_print_named_array("prio_pfc", "prio-pfc",
			      per_prio, count, &dcb_print_array_on_off);
}
/* Print pfc->requests: a JSON array named "requests" in JSON mode, the text
 * "requests " followed by the values otherwise.
 */
static void dcb_pfc_print_requests(const struct ieee_pfc *pfc)
{
	const size_t count = ARRAY_SIZE(pfc->requests);

	open_json_array(PRINT_JSON, "requests");
	print_string(PRINT_FP, NULL, "requests ", NULL);

	dcb_print_array_u64(pfc->requests, count);

	close_json_array(PRINT_JSON, "requests");
}
/* Print pfc->indications: a JSON array named "indications" in JSON mode, the
 * text "indications " followed by the values otherwise.
 */
static void dcb_pfc_print_indications(const struct ieee_pfc *pfc)
{
	const size_t count = ARRAY_SIZE(pfc->indications);

	open_json_array(PRINT_JSON, "indications");
	print_string(PRINT_FP, NULL, "indications ", NULL);

	dcb_print_array_u64(pfc->indications, count);

	close_json_array(PRINT_JSON, "indications");
}
/* Full "show" output: scalar fields on one line, the prio-pfc array on the
 * next, and the request/indication counters only when dcb->stats is set.
 */
static void dcb_pfc_print(const struct dcb *dcb, const struct ieee_pfc *pfc)
{
	dcb_pfc_print_pfc_cap(pfc);
	dcb_pfc_print_macsec_bypass(pfc);
	dcb_pfc_print_delay(pfc);
	print_nl();

	dcb_pfc_print_prio_pfc(pfc);
	print_nl();

	if (!dcb->stats)
		return;

	dcb_pfc_print_requests(pfc);
	print_nl();

	dcb_pfc_print_indications(pfc);
	print_nl();
}
/* Read the DCB_ATTR_IEEE_PFC attribute of @dev into @pfc.
 * Returns whatever dcb_get_attribute() returns.
 */
static int dcb_pfc_get(struct dcb *dcb, const char *dev, struct ieee_pfc *pfc)
{
	const size_t len = sizeof(*pfc);

	return dcb_get_attribute(dcb, dev, DCB_ATTR_IEEE_PFC, pfc, len);
}
/* Write @pfc to @dev as the DCB_ATTR_IEEE_PFC attribute.
 * Returns whatever dcb_set_attribute() returns.
 */
static int dcb_pfc_set(struct dcb *dcb, const char *dev, const struct ieee_pfc *pfc)
{
	const size_t len = sizeof(*pfc);

	return dcb_set_attribute(dcb, dev, DCB_ATTR_IEEE_PFC, pfc, len);
}
/* "dcb pfc set dev DEV ...": fetch the current PFC configuration, apply the
 * prio-pfc / macsec-bypass / delay settings given on the command line and
 * write the result back.
 *
 * Returns 0 after printing help, -EINVAL for an unknown keyword or a bad
 * delay, or the error of the failing step.
 */
static int dcb_cmd_pfc_set(struct dcb *dcb, const char *dev, int argc, char **argv)
{
	struct ieee_pfc pfc;
	int ret;

	if (!argc) {
		dcb_pfc_help_set();
		return 0;
	}

	/* Start from the device's current state so that unmentioned fields
	 * are written back unchanged.
	 */
	ret = dcb_pfc_get(dcb, dev, &pfc);
	if (ret)
		return ret;

	do {
		if (matches(*argv, "help") == 0) {
			dcb_pfc_help_set();
			return 0;
		}

		if (matches(*argv, "prio-pfc") == 0) {
			NEXT_ARG();
			ret = parse_mapping(&argc, &argv, true,
					    &dcb_pfc_parse_mapping_prio_pfc, &pfc);
			if (ret) {
				fprintf(stderr, "Invalid pfc mapping %s\n", *argv);
				return ret;
			}
			/* parse_mapping() is handed &argc/&argv and moves
			 * them itself, so skip the advance below.
			 */
			continue;
		}

		if (matches(*argv, "macsec-bypass") == 0) {
			NEXT_ARG();
			pfc.mbc = parse_on_off("macsec-bypass", *argv, &ret);
			if (ret)
				return ret;
		} else if (matches(*argv, "delay") == 0) {
			NEXT_ARG();
			/* Do not support the size notations for delay.
			 * Delay is specified in "bit times", not bits, so
			 * it is not applicable. At the same time it would
			 * be confusing that 10Kbit does not mean 10240,
			 * but 1280.
			 */
			if (get_u16(&pfc.delay, *argv, 0)) {
				fprintf(stderr, "Invalid delay `%s', expected an integer 0..65535\n",
					*argv);
				return -EINVAL;
			}
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			dcb_pfc_help_set();
			return -EINVAL;
		}

		NEXT_ARG_FWD();
	} while (argc > 0);

	return dcb_pfc_set(dcb, dev, &pfc);
}
static int dcb_cmd_pfc_show(struct dcb *dcb, const char *dev, int argc, char **argv)
{
struct ieee_pfc pfc;
int ret;
ret = dcb_pfc_get(dcb, dev, &pfc);
if (ret)
return ret;
open_json_object(NULL);
if (!argc) {
dcb_pfc_print(dcb, &pfc);
goto out;
}
do {
if (matches(*argv, "help") == 0) {
dcb_pfc_help_show();
return 0;
} else if (matches(*argv, "prio-pfc") == 0) {
dcb_pfc_print_prio_pfc(&pfc);
print_nl();
} else if (matches(*argv, "pfc-cap") == 0) {
dcb_pfc_print_pfc_cap(&pfc);
print_nl();
} else if (matches(*argv, "macsec-bypass") == 0) {
dcb_pfc_print_macsec_bypass(&pfc);
print_nl();
} else if (matches(*argv, "delay") == 0) {
dcb_pfc_print_delay(&pfc);
print_nl();
} else if (matches(*argv, "requests") == 0) {
dcb_pfc_print_requests(&pfc);
print_nl();
} else if (matches(*argv, "indications") == 0) {
dcb_pfc_print_indications(&pfc);
print_nl();
} else {
fprintf(stderr, "What is \"%s\"?\n", *argv);
dcb_pfc_help_show();
return -EINVAL;
}
NEXT_ARG_FWD();
} while (argc > 0);
out:
close_json_object();
return 0;
}
/* Entry point of "dcb pfc": dispatch to the help, show or set handler. */
int dcb_cmd_pfc(struct dcb *dcb, int argc, char **argv)
{
	if (!argc || matches(*argv, "help") == 0) {
		dcb_pfc_help();
		return 0;
	}

	if (matches(*argv, "show") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_pfc_show, dcb_pfc_help_show);
	}

	if (matches(*argv, "set") == 0) {
		NEXT_ARG_FWD();
		return dcb_cmd_parse_dev(dcb, argc, argv,
					 dcb_cmd_pfc_set, dcb_pfc_help_set);
	}

	fprintf(stderr, "What is \"%s\"?\n", *argv);
	dcb_pfc_help();
	return -EINVAL;
}

1
devlink/.gitignore vendored
View File

@ -1 +0,0 @@
devlink

View File

@ -1,25 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
# Build rules for the devlink tool.
include ../config.mk
# devlink is only added to TARGETS when HAVE_MNL is "y"; otherwise TARGETS
# stays empty and "all"/"install" have nothing to do.
# (HAVE_MNL is presumably set by the included ../config.mk - verify.)
TARGETS :=
ifeq ($(HAVE_MNL),y)
DEVLINKOBJ = devlink.o mnlg.o
TARGETS += devlink
LDLIBS += -lm
endif
all: $(TARGETS) $(LIBS)
devlink: $(DEVLINKOBJ) $(LIBNETLINK)
$(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@
# Install every built target into $(DESTDIR)$(SBINDIR) with mode 0755.
install: all
for i in $(TARGETS); \
do install -m 0755 $$i $(DESTDIR)$(SBINDIR); \
done
# Remove the object files and the built targets.
clean:
rm -f $(DEVLINKOBJ) $(TARGETS)

File diff suppressed because it is too large Load Diff

View File

@ -1,155 +0,0 @@
/*
* mnlg.c Generic Netlink helpers for libmnl
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jiri Pirko <jiri@mellanox.com>
*/
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <libmnl/libmnl.h>
#include <linux/genetlink.h>
#include "libnetlink.h"
#include "mnl_utils.h"
#include "utils.h"
#include "mnlg.h"
/* Generic netlink socket state.
 * NOTE(review): none of the functions visible in this file use this type
 * (they all take struct mnlu_gen_socket) - it looks like a leftover; confirm
 * before removing.
 */
struct mnlg_socket {
	struct mnl_socket *nl;	/* underlying libmnl socket */
	char *buf;
	uint32_t id;
	uint8_t version;
	unsigned int seq;
};
/* Send the netlink message @nlh (nlh->nlmsg_len bytes) on the socket of
 * @nlg. Returns whatever mnl_socket_sendto() returns.
 */
int mnlg_socket_send(struct mnlu_gen_socket *nlg, const struct nlmsghdr *nlh)
{
	const size_t msg_len = nlh->nlmsg_len;

	return mnl_socket_sendto(nlg->nl, nlh, msg_len);
}
/* Lookup state for resolving a multicast group name to its id. */
struct group_info {
	bool found;		/* set once a group called @name has been seen */
	uint32_t id;		/* id of that group; only meaningful when found */
	const char *name;	/* group name being searched for */
};
/* Attribute callback for one multicast-group entry: validate the id and name
 * attributes and record each attribute in the table passed as @data, indexed
 * by attribute type. Attribute types above CTRL_ATTR_MCAST_GRP_MAX are
 * skipped.
 */
static int parse_mc_grps_cb(const struct nlattr *attr, void *data)
{
	const struct nlattr **table = data;
	int attr_type;

	if (mnl_attr_type_valid(attr, CTRL_ATTR_MCAST_GRP_MAX) < 0)
		return MNL_CB_OK;

	attr_type = mnl_attr_get_type(attr);

	if (attr_type == CTRL_ATTR_MCAST_GRP_ID) {
		if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0)
			return MNL_CB_ERROR;
	} else if (attr_type == CTRL_ATTR_MCAST_GRP_NAME) {
		if (mnl_attr_validate(attr, MNL_TYPE_STRING) < 0)
			return MNL_CB_ERROR;
	}

	table[attr_type] = attr;
	return MNL_CB_OK;
}
/* Walk the nested multicast-group list in @nested and, for every group whose
 * name equals group_info->name, store its id and set group_info->found.
 * The walk does not stop at the first hit, so the last matching entry wins.
 */
static void parse_genl_mc_grps(struct nlattr *nested,
			       struct group_info *group_info)
{
	struct nlattr *entry;

	mnl_attr_for_each_nested(entry, nested) {
		struct nlattr *tb[CTRL_ATTR_MCAST_GRP_MAX + 1] = {};
		const char *grp_name;

		mnl_attr_parse_nested(entry, parse_mc_grps_cb, tb);

		/* Entries lacking either a name or an id cannot be matched. */
		if (tb[CTRL_ATTR_MCAST_GRP_NAME] && tb[CTRL_ATTR_MCAST_GRP_ID]) {
			grp_name = mnl_attr_get_str(tb[CTRL_ATTR_MCAST_GRP_NAME]);
			if (strcmp(grp_name, group_info->name) == 0) {
				group_info->id = mnl_attr_get_u32(tb[CTRL_ATTR_MCAST_GRP_ID]);
				group_info->found = true;
			}
		}
	}
}
/* Attribute callback for the family reply: reject attribute types above
 * CTRL_ATTR_MAX, require CTRL_ATTR_MCAST_GROUPS to be a nested attribute, and
 * record each attribute in the table passed as @data, indexed by type.
 */
static int get_group_id_attr_cb(const struct nlattr *attr, void *data)
{
	const struct nlattr **table = data;
	int attr_type;

	if (mnl_attr_type_valid(attr, CTRL_ATTR_MAX) < 0)
		return MNL_CB_ERROR;

	attr_type = mnl_attr_get_type(attr);
	if (attr_type == CTRL_ATTR_MCAST_GROUPS) {
		if (mnl_attr_validate(attr, MNL_TYPE_NESTED) < 0)
			return MNL_CB_ERROR;
	}

	table[attr_type] = attr;
	return MNL_CB_OK;
}
/* Message callback for the family reply: parse its attributes and search the
 * multicast-group list for the name stored in the struct group_info passed as
 * @data. Fails when the message carries no CTRL_ATTR_MCAST_GROUPS attribute.
 */
static int get_group_id_cb(const struct nlmsghdr *nlh, void *data)
{
	struct nlattr *attrs[CTRL_ATTR_MAX + 1] = {};
	struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh);
	struct group_info *lookup = data;

	mnl_attr_parse(nlh, sizeof(*genl), get_group_id_attr_cb, attrs);

	if (attrs[CTRL_ATTR_MCAST_GROUPS] == NULL)
		return MNL_CB_ERROR;

	parse_genl_mc_grps(attrs[CTRL_ATTR_MCAST_GROUPS], lookup);
	return MNL_CB_OK;
}
int mnlg_socket_group_add(struct mnlu_gen_socket *nlg, const char *group_name)
{
struct nlmsghdr *nlh;
struct group_info group_info;
int err;
nlh = _mnlu_gen_socket_cmd_prepare(nlg, CTRL_CMD_GETFAMILY,
NLM_F_REQUEST | NLM_F_ACK,
GENL_ID_CTRL, 1);
mnl_attr_put_u16(nlh, CTRL_ATTR_FAMILY_ID, nlg->family);
err = mnlg_socket_send(nlg, nlh);
if (err < 0)
return err;
group_info.found = false;
group_info.name = group_name;
err = mnlu_gen_socket_recv_run(nlg, get_group_id_cb, &group_info);
if (err < 0)
return err;
if (!group_info.found) {
errno = ENOENT;
return -1;
}
err = mnl_socket_setsockopt(nlg->nl, NETLINK_ADD_MEMBERSHIP,
&group_info.id, sizeof(group_info.id));
if (err < 0)
return err;
return 0;
}
/* Return the file descriptor of the libmnl socket behind @nlg. */
int mnlg_socket_get_fd(struct mnlu_gen_socket *nlg)
{
	struct mnl_socket *sock = nlg->nl;

	return mnl_socket_get_fd(sock);
}

View File

@ -1,23 +0,0 @@
/*
* mnlg.h Generic Netlink helpers for libmnl
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jiri Pirko <jiri@mellanox.com>
*/
#ifndef _MNLG_H_
#define _MNLG_H_
#include <libmnl/libmnl.h>
/* Only pointers to the socket type are used here, so a forward declaration
 * is enough.
 */
struct mnlu_gen_socket;
/* Send the netlink message @nlh on the socket of @nlg. */
int mnlg_socket_send(struct mnlu_gen_socket *nlg, const struct nlmsghdr *nlh);
/* Subscribe the socket of @nlg to the multicast group called @group_name. */
int mnlg_socket_group_add(struct mnlu_gen_socket *nlg, const char *group_name);
/* Return the file descriptor of the socket behind @nlg. */
int mnlg_socket_get_fd(struct mnlu_gen_socket *nlg);
#endif /* _MNLG_H_ */

73
doc/Makefile Normal file
View File

@ -0,0 +1,73 @@
# PostScript documents built by the default target.
PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps
# tc-cref.ps
# api-rtnl.tex api-pmtudisc.tex api-news.tex
# iki-netdev.ps iki-neighdst.ps
# Tools used by the rules below.
LATEX=latex
DVIPS=dvips
SGML2DVI=sgml2latex
SGML2HTML=sgml2html -s 0
LPR=lpr -Zsduplex
# The LaTeX retry loops below use bash-only $[...] arithmetic.
SHELL=bash
PAGESIZE=a4
PAGESPERPAGE=2
# File lists derived from the *.sgml sources and from PSFILES.
HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml))
DVIFILES=$(subst .ps,.dvi,$(PSFILES))
PDFFILES=$(subst .ps,.pdf,$(PSFILES))
all: pstwocol
pstwocol: $(PSFILES)
html: $(HTMLFILES)
dvi: $(DVIFILES)
pdf: $(PDFFILES)
# Send all PostScript files to the printer.
print: $(PSFILES)
$(LPR) $(PSFILES)
%.tex: %.sgml
$(SGML2DVI) --output=tex $<
%.dvi: %.sgml
$(SGML2DVI) --output=dvi $<
# Re-run LaTeX while its output still reports unresolved labels, missing
# files or an emergency stop; give up once the pass counter exceeds 3.
%.dvi: %.tex
@set -e; pass=2; echo "Running LaTeX $<"; \
while [ `$(LATEX) $< </dev/null 2>&1 | \
grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
if [ $$pass -gt 3 ]; then \
echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
fi; \
echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
done
# Same retry loop as for %.dvi, using pdflatex.
%.pdf: %.tex
@set -e; pass=2; echo "Running pdfLaTeX $<"; \
while [ `pdflatex $< </dev/null 2>&1 | \
grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
if [ $$pass -gt 3 ]; then \
echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
fi; \
echo "Re-running pdfLaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
done
#%.pdf: %.ps
# ps2pdf $<
%.ps: %.dvi
$(DVIPS) $< -o $@
%.html: %.sgml
$(SGML2HTML) $<
# Install the .tex and .sgml sources (not the generated documents) into
# $(DESTDIR)$(DOCDIR).
install:
install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR)
install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR)
# Remove LaTeX by-products and every generated document.
clean:
rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html *.pdf

16
doc/Plan Normal file
View File

@ -0,0 +1,16 @@
Partially finished work.
1. User Reference manuals.
1.1 IP Command reference (ip-cref.tex, published)
1.2 TC Command reference (tc-cref.tex)
1.3 IP tunnels (ip-tunnels.tex, published)
2. Linux-2.2 Networking API
2.1 RTNETLINK (api-rtnl.tex)
2.2 Path MTU Discovery (api-pmtudisc.tex)
2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published)
2.4 Miscellaneous extensions (api-misc.tex)
3. Linux-2.2 Networking Intra-Kernel Interfaces
3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex)
3.2 Neighbour cache and destination cache. (iki-neighdst.tex)

1
doc/SNAPSHOT.tex Normal file
View File

@ -0,0 +1 @@
\def\Draft{020116}

View File

@ -6,8 +6,8 @@ What is it?
-----------
An extension to the filtering/classification architecture of Linux Traffic
Control.
Up to 2.6.8 the only action that could be "attached" to a filter was policing.
Control.
Up to 2.6.8 the only action that could be "attached" to a filter was policing.
i.e you could say something like:
-----
@ -17,11 +17,11 @@ tc filter add dev lo parent ffff: protocol ip prio 10 u32 match ip src \
which implies "if a packet is seen on the ingress of the lo device with
a source IP address of 127.0.0.1/32 we give it a classification id of 1:1 and
we execute a policing action which rate limits its bandwidth utilization
we execute a policing action which rate limits its bandwidth utilization
to 1.5Mbps".
The new extensions allow for more than just policing actions to be added.
They are also fully backward compatible. If you have a kernel that doesn't
They are also fully backward compatible. If you have a kernel that doesnt
understand them, then the effect is null i.e if you have a newer tc
but older kernel, the actions are not installed. Likewise if you
have a newer kernel but older tc, obviously the tc will use current
@ -29,9 +29,9 @@ syntax which will work fine. Of course to get the required effect you need
both newer tc and kernel. If you are reading this you have the
right tc ;->
A side effect is that we can now get stateless firewalling to work with tc.
A side effect is that we can now get stateless firewalling to work with tc.
Essentially this is now an alternative to iptables.
I won't go into details of my dislike for iptables at times, but
I wont go into details of my dislike for iptables at times, but
scalability is one of the main issues; however, if you need stateful
classification - use netfilter (for now).
@ -61,7 +61,7 @@ tc filter add dev lo parent 1:0 protocol ip prio 10 u32 \
match ip src 127.0.0.1/32 flowid 1:1 \
action police mtu 4000 rate 1500kbit burst 90k
" generic Actions" (gact) at the moment are:
" generic Actions" (gact) at the moment are:
{ drop, pass, reclassify, continue}
(If you have others, no listed here give me a reason and we will add them)
+drop says to drop the packet
@ -77,7 +77,7 @@ iptable target. I have only tested with mangler targets up to now.
In terms of hooks:
*ingress is mapped to pre-routing hook
*egress is mapped to post-routing hook
I don't see much value in the other hooks, if you see it and email me good
I dont see much value in the other hooks, if you see it and email me good
reasons, the addition is trivial.
Example syntax for iptables targets usage becomes:
@ -93,43 +93,43 @@ decimal 12, then use flowid 1:c.
3) A feature i call pipe
The motivation is derived from Unix pipe mechanism but applied to packets.
Essentially take a matching packet and pass it through
Essentially take a matching packet and pass it through
action1 | action2 | action3 etc.
You could do something similar to this with the tc policer and the "continue"
operator but this rather restricts it to just the policer and requires
multiple rules (and lookups, hence quiet inefficient);
operator but this rather restricts it to just the policer and requires
multiple rules (and lookups, hence quiet inefficient);
as an example -- and please note that this is just an example _not_ The
as an example -- and please note that this is just an example _not_ The
Word Youve Been Waiting For (yes i have had problems giving examples
which ended becoming dogma in documents and people modifying them a little
to look clever);
to look clever);
i selected the metering rates to be small so that i can show better how
i selected the metering rates to be small so that i can show better how
things work.
The script below does the following:
- an incoming packet from 10.0.0.21 is first given a firewall mark of 1.
The script below does the following:
- an incoming packet from 10.0.0.21 is first given a firewall mark of 1.
- It is then metered to make sure it does not exceed its allocated rate of
1Kbps. If it doesnt exceed rate, this is where we terminate action execution.
- It is then metered to make sure it does not exceed its allocated rate of
1Kbps. If it doesn't exceed rate, this is where we terminate action execution.
- If it does exceed its rate, its "color" changes to a mark of 2 and it is
- If it does exceed its rate, its "color" changes to a mark of 2 and it is
then passed through a second meter.
-The second meter is shared across all flows on that device [i am surpised
that this seems to be not a well know feature of the policer; Bert was telling
-The second meter is shared across all flows on that device [i am suprised
that this seems to be not a well know feature of the policer; Bert was telling
me that someone was writing a qdisc just to do sharing across multiple devices;
it must be the summer heat again; weve had someone doing that every year around
summer -- the key to sharing is to use a operator "index" in your policer
rules (example "index 20"). All your rules have to use the same index to
summer -- the key to sharing is to use a operator "index" in your policer
rules (example "index 20"). All your rules have to use the same index to
share.]
-If the second meter is exceeded the color of the flow changes further to 3.
-We then pass the packet to another meter which is shared across all devices
in the system. If this meter is exceeded we drop the packet.
Note the mark can be used further up the system to do things like policy
Note the mark can be used further up the system to do things like policy
or more interesting things on the egress.
------------------ cut here -------------------------------
@ -145,7 +145,7 @@ u32 match ip src 10.0.0.21/32 flowid 1:15 \
action ipt -j mark --set-mark 1 index 2 \
#
# then pass it through a policer which allows 1kbps; if the flow
# doesn't exceed that rate, this is where we stop, if it exceeds we
# doesnt exceed that rate, this is where we stop, if it exceeds we
# pipe the packet to the next action
action police rate 1kbit burst 9k pipe \
#
@ -161,31 +161,31 @@ action ipt -j mark --set-mark 3 \
# and then attempt to borrow from a meter used by all devices in the
# system. Should this be exceeded, drop the packet on the floor.
action police index 20 mtu 5000 rate 1kbit burst 90k drop
---------------------------------
---------------------------------
Now lets see the actions installed with
Now lets see the actions installed with
"tc filter show parent ffff: dev eth0"
-------- output -----------
jroot# tc filter show parent ffff: dev eth0
filter protocol ip pref 1 u32
filter protocol ip pref 1 u32 fh 800: ht divisor 1
filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15
filter protocol ip pref 1 u32
filter protocol ip pref 1 u32 fh 800: ht divisor 1
filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x1 index 2
action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x2 index 1
action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x3 index 3
action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
match 0a000015/ffffffff at 12
-------------------------------
@ -209,31 +209,31 @@ Now lets take a look at the stats with "tc -s filter show parent ffff: dev eth0"
--------------
jroot# tc -s filter show parent ffff: dev eth0
filter protocol ip pref 1 u32
filter protocol ip pref 1 u32 fh 800: ht divisor 1
filter protocol ip pref 1 u32
filter protocol ip pref 1 u32 fh 800: ht divisor 1
filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
5
5
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x1 index 2
Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0)
Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0)
action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122)
action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122)
action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x2 index 1
Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0)
Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0)
action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945)
action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945)
action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x3 index 3
Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0)
Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0)
action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437)
action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437)
match 0a000015/ffffffff at 12
-------------------------------
@ -241,7 +241,7 @@ filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
Neat, eh?
Want to write an action module?
Wanna write an action module?
------------------------------
Its easy. Either look at the code or send me email. I will document at
some point; will also accept documentation.
@ -254,3 +254,4 @@ At the moment the focus has been on getting the architecture in place.
Expect new things in the spurious time i have to work on this
(particularly around end of year when i have typically get time off
from work).

View File

@ -1,16 +1,16 @@
gact <ACTION> [RAND] [INDEX]
Where:
ACTION := reclassify | drop | continue | pass | ok
Where:
ACTION := reclassify | drop | continue | pass | ok
RAND := random <RANDTYPE> <ACTION> <VAL>
RANDTYPE := netrand | determ
VAL : = value not exceeding 10000
INDEX := index value used
ACTION semantics
- pass and ok are equivalent to accept
- continue allows one to restart classification lookup
- continue allows to restart classification lookup
- drop drops packets
- reclassify implies continue classification where we left off
@ -42,14 +42,14 @@ filter u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 32 suc
random type none pass val 0
index 1 ref 1 bind 1 installed 59 sec used 35 sec
Sent 1680 bytes 20 pkts (dropped 20, overlimits 0 )
----
# example 2
#allow 1 out 10 randomly using the netrand generator
tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
10.0.0.9/32 flowid 1:16 action drop random netrand ok 10
ping -c 20 10.0.0.9
----
@ -59,14 +59,14 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1
random type netrand pass val 10
index 5 ref 1 bind 1 installed 49 sec used 25 sec
Sent 1680 bytes 20 pkts (dropped 16, overlimits 0 )
--------
#alternative: deterministically accept every second packet
tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
10.0.0.9/32 flowid 1:16 action drop random determ ok 2
ping -c 20 10.0.0.9
tc -s filter show parent ffff: dev eth0
-----
filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1filter protocol ip pref 6 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 20 success 20)
@ -76,3 +76,4 @@ filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1
index 4 ref 1 bind 1 installed 118 sec used 82 sec
Sent 1680 bytes 20 pkts (dropped 10, overlimits 0 )
-----

View File

@ -6,47 +6,47 @@ with a _lot_ less code.
Known IMQ/IFB USES
------------------
As far as i know the reasons listed below is why people use IMQ.
As far as i know the reasons listed below is why people use IMQ.
It would be nice to know of anything else that i missed.
1) qdiscs/policies that are per device as opposed to system wide.
IFB allows for sharing.
2) Allows for queueing incoming traffic for shaping instead of
dropping. I am not aware of any study that shows policing is
dropping. I am not aware of any study that shows policing is
worse than shaping in achieving the end goal of rate control.
I would be interested if anyone is experimenting.
3) Very interesting use: if you are serving p2p you may want to give
preference to your own locally originated traffic (when responses come back)
3) Very interesting use: if you are serving p2p you may wanna give
preference to your own localy originated traffic (when responses come back)
vs someone using your system to do bittorent. So QoSing based on state
comes in as the solution. What people did to achieve this was stick
comes in as the solution. What people did to achive this was stick
the IMQ somewhere prelocal hook.
I think this is a pretty neat feature to have in Linux in general.
(i.e not just for IMQ).
But i won't go back to putting netfilter hooks in the device to satisfy
this. I also don't think its worth it hacking ifb some more to be
But i wont go back to putting netfilter hooks in the device to satisfy
this. I also dont think its worth it hacking ifb some more to be
aware of say L3 info and play ip rule tricks to achieve this.
--> Instead the plan is to have a conntrack related action. This action will
selectively either query/create conntrack state on incoming packets.
Packets could then be redirected to ifb based on what happens -> eg
on incoming packets; if we find they are of known state we could send to
a different queue than one which didn't have existing state. This
--> Instead the plan is to have a contrack related action. This action will
selectively either query/create contrack state on incoming packets.
Packets could then be redirected to ifb based on what happens -> eg
on incoming packets; if we find they are of known state we could send to
a different queue than one which didnt have existing state. This
all however is dependent on whatever rules the admin enters.
At the moment this 3rd function does not exist yet. I have decided that
instead of sitting on the patch for another year, to release it and then
if there is pressure i will add this feature.
instead of sitting on the patch for another year, to release it and then
if theres pressure i will add this feature.
An example, to provide functionality that most people use IMQ for below:
--------
export TC="/sbin/tc"
$TC qdisc add dev ifb0 root handle 1: prio
$TC qdisc add dev ifb0 root handle 1: prio
$TC qdisc add dev ifb0 parent 1:1 handle 10: sfq
$TC qdisc add dev ifb0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000
$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq
$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq
$TC filter add dev ifb0 protocol ip pref 1 parent 1: handle 1 fw classid 1:1
$TC filter add dev ifb0 protocol ip pref 2 parent 1: handle 2 fw classid 1:2
@ -54,7 +54,7 @@ ifconfig ifb0 up
$TC qdisc add dev eth0 ingress
# redirect all IP packets arriving in eth0 to ifb0
# redirect all IP packets arriving in eth0 to ifb0
# use mark 1 --> puts them onto class 1:1
$TC filter add dev eth0 parent ffff: protocol ip prio 10 u32 \
match u32 0 0 flowid 1:1 \
@ -77,44 +77,44 @@ PING 10.22 (10.0.0.22): 56 data bytes
--- 10.22 ping statistics ---
3 packets transmitted, 3 packets received, 0% packet loss
round-trip min/avg/max = 0.6/1.3/2.8 ms
[root@jzny action-tests]#
[root@jzny action-tests]#
-----
Now look at some stats:
---
[root@jmandrake]:~# $TC -s filter show parent ffff: dev eth0
filter protocol ip pref 10 u32
filter protocol ip pref 10 u32 fh 800: ht divisor 1
filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
filter protocol ip pref 10 u32
filter protocol ip pref 10 u32 fh 800: ht divisor 1
filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
match 00000000/00000000 at 0
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x1
index 1 ref 1 bind 1 installed 4195sec used 27sec
Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
target MARK set 0x1
index 1 ref 1 bind 1 installed 4195sec used 27sec
Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
action order 2: mirred (Egress Redirect to device ifb0) stolen
index 1 ref 1 bind 1 installed 165 sec used 27 sec
Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
[root@jmandrake]:~# $TC -s qdisc
qdisc sfq 30: dev ifb0 limit 128p quantum 1514b
Sent 0 bytes 0 pkts (dropped 0, overlimits 0)
qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s
Sent 210 bytes 3 pkts (dropped 0, overlimits 0)
qdisc sfq 10: dev ifb0 limit 128p quantum 1514b
Sent 294 bytes 3 pkts (dropped 0, overlimits 0)
qdisc sfq 30: dev ifb0 limit 128p quantum 1514b
Sent 0 bytes 0 pkts (dropped 0, overlimits 0)
qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s
Sent 210 bytes 3 pkts (dropped 0, overlimits 0)
qdisc sfq 10: dev ifb0 limit 128p quantum 1514b
Sent 294 bytes 3 pkts (dropped 0, overlimits 0)
qdisc prio 1: dev ifb0 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
Sent 504 bytes 6 pkts (dropped 0, overlimits 0)
qdisc ingress ffff: dev eth0 ----------------
Sent 308 bytes 5 pkts (dropped 0, overlimits 0)
Sent 504 bytes 6 pkts (dropped 0, overlimits 0)
qdisc ingress ffff: dev eth0 ----------------
Sent 308 bytes 5 pkts (dropped 0, overlimits 0)
[root@jmandrake]:~# ifconfig ifb0
ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00
ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00
inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link
UP BROADCAST RUNNING NOARP MTU:1500 Metric:1
RX packets:6 errors:0 dropped:3 overruns:0 frame:0
TX packets:3 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:32
collisions:0 txqueuelen:32
RX bytes:504 (504.0 b) TX bytes:252 (252.0 b)
-----

View File

@ -7,10 +7,10 @@ flow to be mirrored. High end switches typically can select based
on more than just a port (eg a 5 tuple classifier). They may also be
capable of redirecting.
Usage:
Usage:
mirred <DIRECTION> <ACTION> [index INDEX] <dev DEVICENAME>
where:
mirred <DIRECTION> <ACTION> [index INDEX] <dev DEVICENAME>
where:
DIRECTION := <ingress | egress>
ACTION := <mirror | redirect>
INDEX is the specific policy instance id
@ -18,7 +18,7 @@ DEVICENAME is the devicename
Direction:
- Ingress is not supported at the moment. It will be in the
future as well as mirror/redirecting to a socket.
future as well as mirror/redirecting to a socket.
Action:
- Mirror takes a copy of the packet and sends it to specified
@ -26,17 +26,17 @@ dev ("port" in ethernet switch/bridging terminology)
- redirect
steals the packet and redirects to specified destination dev.
What NOT to do if you don't want your machine to crash:
What NOT to do if you dont want your machine to crash:
------------------------------------------------------
Do not create loops!
Do not create loops!
Loops are not hard to create in the egress qdiscs.
Here are simple rules to follow if you don't want to get
Here are simple rules to follow if you dont want to get
hurt:
A) Do not have the same packet go to same netdevice twice
in a single graph of policies. Your machine will just hang!
This is design intent _not a bug_ to teach you some lessons.
This is design intent _not a bug_ to teach you some lessons.
In the future if there are easy ways to do this in the kernel
without affecting other packets not interested in this feature
@ -51,7 +51,7 @@ B) Do not redirect from one IFB device to another.
Remember that IFB is a very specialized case of packet redirecting
device. Instead of redirecting it puts packets at the exact spot
on the stack it found them from.
Redirecting from ifbX->ifbY will actually not crash your machine but your
Redirecting from ifbX->ifbY will actually not crash your machine but your
packets will all be dropped (this is much simpler to detect
and resolve and is only affecting users of ifb as opposed to the
whole stack).
@ -64,7 +64,7 @@ Some examples:
1) Mirror all packets arriving on eth0 to be sent out on eth1.
You may have a sniffer or some accounting box hooked up on eth1.
---
tc qdisc add dev eth0 ingress
tc filter add dev eth0 parent ffff: protocol ip prio 10 u32 \
@ -100,7 +100,7 @@ stack (i.e ping would work).
3) Even more funky example:
#
#allow 1 out 10 packets on ingress of lo to randomly make it to the
#allow 1 out 10 packets on ingress of lo to randomly make it to the
# host A (Randomness uses the netrand generator)
#
---
@ -111,9 +111,9 @@ action mirred egress mirror dev eth0
---
4)
# for packets from 10.0.0.9 going out on eth0 (could be local
# IP or something # we are forwarding) -
# if exceeding a 100Kbps rate, then redirect to eth1
# for packets from 10.0.0.9 going out on eth0 (could be local
# IP or something # we are forwarding) -
# if exceeding a 100Kbps rate, then redirect to eth1
#
---
@ -129,7 +129,7 @@ so you could tcpdump them (dummy by defaults drops all packets it sees).
This is a very useful debug feature.
Lets say you are policing packets from alias 192.168.200.200/32
you don't want those to exceed 100kbps going out.
you dont want those to exceed 100kbps going out.
---
tc qdisc add dev eth0 handle 1:0 root prio
@ -158,7 +158,7 @@ Essentially a good debugging/logging interface (sort of like
BSDs speacialized log device does without needing one).
If you replace mirror with redirect, those packets will be
blackholed and will never make it out.
blackholed and will never make it out.
cheers,
jamal

429
doc/api-ip6-flowlabels.tex Normal file
View File

@ -0,0 +1,429 @@
\documentstyle[12pt,twoside]{article}
\def\TITLE{IPv6 Flow Labels}
\input preamble
\begin{center}
\Large\bf IPv6 Flow Labels in Linux-2.2.
\end{center}
\begin{center}
{ \large Alexey~N.~Kuznetsov } \\
\em Institute for Nuclear Research, Moscow \\
\verb|kuznet@ms2.inr.ac.ru| \\
\rm April 11, 1999
\end{center}
\vspace{5mm}
\tableofcontents
\section{Introduction.}
Every IPv6 packet carries 28 bits of flow information. RFC2460 splits
these bits to two fields: 8 bits of traffic class (or DS field, if you
prefer this term) and 20 bits of flow label. Currently there exists
no well-defined API to manage IPv6 flow information. In this document
I describe an attempt to design the API for Linux-2.2 IPv6 stack.
\vskip 1mm
The API must solve the following tasks:
\begin{enumerate}
\item To allow user to set traffic class bits.
\item To allow user to read traffic class bits of received packets.
This feature is not so useful as the first one, however it will be
necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services
or to implement receiver side of SRP or another end-to-end protocol
using traffic class bits.
\item To assign flow labels to packets sent by user.
\item To get flow labels of received packets. I do not know
any applications of this feature, but it is possible that receiver will
want to use flow labels to distinguish sub-flows.
\item To allocate flow labels in the way, compliant to RFC2460. Namely:
\begin{itemize}
\item
Flow labels must be uniformly distributed (pseudo-)random numbers,
so that any subset of 20 bits can be used as hash key.
\item
Flows with coinciding source address and flow label must have identical
destination address and not-fragmentable extensions headers (i.e.\
hop by hop options and all the headers up to and including routing header,
if it is present.)
\begin{NB}
There is a hole in specs: some hop-by-hop options can be
defined only on per-packet base (f.e.\ jumbo payload option).
Essentially, it means that such options cannot be present in packets
with flow labels.
\end{NB}
\begin{NB}
NB notes here and below reflect only my personal opinion,
they should be read with smile or should not be read at all :-).
\end{NB}
\item
Flow labels have finite lifetime and source is not allowed to reuse
flow label for another flow until the maximal lifetime has expired,
so that intermediate nodes will be able to invalidate flow state before
the label is taken over by another flow.
Flow state, including lifetime, is propagated along datagram path
by some application specific methods
(f.e.\ in RSVP PATH messages or in some hop-by-hop option).
\end{itemize}
\end{enumerate}
\section{Sending/receiving flow information.}
\paragraph{Discussion.}
\addcontentsline{toc}{subsection}{Discussion}
It was proposed (Where? I do not remember any explicit statement)
to solve the first four tasks using
\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6|
(see RFC2553).
\begin{NB}
This method is difficult to consider as reasonable, because it
puts additional overhead to all the services, despite of only
very small subset of them (none, to be more exact) really use it.
It contradicts both to IETF spirit and the letter. Before RFC2553
one justification existed, IPv6 address alignment left 4 byte
hole in \verb|sockaddr_in6| in any case. Now it has no justification.
\end{NB}
We have two problems with this method. The first one is common for all OSes:
if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info
of received packet, we lose one very important property of BSD socket API,
namely, we are not allowed to use received address for reply directly
and have to mangle it, even if we are not interested in flowinfo subtleties.
\begin{NB}
RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|.
Certainly, it is not solution but rather attempt to force applications
to make unnecessary work. Well, as usually, one mistake in design
is followed by attempts to patch the hole and more mistakes...
\end{NB}
Another problem is Linux specific. Historically Linux IPv6 did not
initialize \verb|sin6_flowinfo| at all, so that, if kernel does not
support flow labels, this field is not zero, but a random number.
Some applications also did not take care about it.
\begin{NB}
Following RFC2553 such applications can be considered as broken,
but I still think that they are right: clearing all the address
before filling known fields is robust but stupid solution.
Useless wasting CPU cycles and
memory bandwidth is not a good idea. Such patches are acceptable
as temporary hacks, but not as standard of the future.
\end{NB}
\paragraph{Implementation.}
\addcontentsline{toc}{subsection}{Implementation}
By default Linux IPv6 does not read \verb|sin6_flowinfo| field
assuming that common applications are not obliged to initialize it
and are permitted to consider it as pure alignment padding.
In order to tell kernel that application
is aware of this field, it is necessary to set socket option
\verb|IPV6_FLOWINFO_SEND|.
\begin{verbatim}
int on = 1;
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND,
(void*)&on, sizeof(on));
\end{verbatim}
Linux kernel never fills \verb|sin6_flowinfo| field, when passing
message to user space, though the kernels which support flow labels
initialize it to zero. If user wants to get received flowinfo, he
will set option \verb|IPV6_FLOWINFO| and after this he will receive
flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO|
(cf.\ RFC2292).
\begin{verbatim}
int on = 1;
setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on));
\end{verbatim}
Flowinfo received and latched by a connected TCP socket also may be fetched
with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with
another optional information.
Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO|
may be used as alternative way to send flowinfo with \verb|sendmsg()| or
to latch it with \verb|IPV6_PKTOPTIONS|.
\paragraph{Note about IPv6 options and destination address.}
\addcontentsline{toc}{subsection}{IPv6 options and destination address}
If \verb|sin6_flowinfo| contains a nonzero flow label,
destination address in \verb|sin6_addr| and non-fragmentable
extension headers are ignored. Instead, kernel uses the values
cached at flow setup (see below). However, for connected sockets
kernel prefers the values set at connection time.
\paragraph{Example.}
\addcontentsline{toc}{subsection}{Example}
After setting socket option \verb|IPV6_FLOWINFO|
flowlabel and DS field are received as ancillary data object
of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|.
In the cases when it is convenient to use \verb|recvfrom(2)|,
it is possible to replace library variant with your own one,
sort of:
\begin{verbatim}
#include <sys/socket.h>
#include <netinet/in6.h>
/*
 * Replacement for the library recvfrom(): receives one message with
 * recvmsg() and copies the flow information delivered as an
 * IPV6_FLOWINFO ancillary object into addr->sin6_flowinfo
 * (zero if no such object was received).
 * Returns what recvmsg() returned; on failure that is -1, converted
 * to the declared return type.
 */
size_t recvfrom(int fd, char *buf, size_t len, int flags,
                struct sockaddr *addr, int *addrlen)
{
        /* Signed, so that the -1 error return of recvmsg() is detectable. */
        ssize_t cc;
        char cbuf[128];
        struct cmsghdr *c;
        struct iovec iov = { buf, len };
        struct msghdr msg = { addr, *addrlen,
                              &iov, 1,
                              cbuf, sizeof(cbuf),
                              0 };

        cc = recvmsg(fd, &msg, flags);
        if (cc < 0)
                return cc;      /* do not touch addr or parse cbuf on error */

        ((struct sockaddr_in6*)addr)->sin6_flowinfo = 0;
        *addrlen = msg.msg_namelen;
        for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
                if (c->cmsg_level != SOL_IPV6 ||
                    c->cmsg_type != IPV6_FLOWINFO)
                        continue;
                ((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c);
        }
        return cc;
}
\end{verbatim}
\section{Flow label management.}
\paragraph{Discussion.}
\addcontentsline{toc}{subsection}{Discussion}
Requirements of RFC2460 are pretty tough. Particularly, lifetimes
longer than boot time require to store allocated labels at stable
storage, so that the full implementation necessarily includes user space flow
label manager. There are at least three different approaches:
\begin{enumerate}
\item {\bf ``Cooperative''. } We could leave flow label allocation wholly
to user space. When user needs label he requests manager directly. The approach
is valid, but as any ``cooperative'' approach it suffers from security problems.
\begin{NB}
One idea is to disallow not privileged user to allocate flow
labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS|
control message, so that it will allocate label and assign it to socket
itself. Hmm... the idea is interesting.
\end{NB}
\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon
and does not install label until the daemon acknowledged the request.
The approach is the most promising, it is especially pleasant to recognize
parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with
IPsec.
\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest
method, but it suffers from two serious flaws: the first,
we cannot lease labels with lifetimes longer than boot time, the second,
it is sensitive to DoS attacks. Kernel has to remember all the obsolete
labels until their expiration and malicious user may quickly eat all the
flow label space.
\end{enumerate}
Certainly, I choose the most ``stupid'' method. It is the cheapest one
for implementor (i.e.\ me), and taking into account that flow labels
still have no serious applications it is not useful to work on more
advanced API, especially, taking into account that eventually we
will get it for no fee together with IPsec.
\paragraph{Implementation.}
\addcontentsline{toc}{subsection}{Implementation}
Socket option \verb|IPV6_FLOWLABEL_MGR| allows to
request flow label manager to allocate new flow label, to reuse
already allocated one or to delete old flow label.
Its argument is \verb|struct| \verb|in6_flowlabel_req|:
\begin{verbatim}
struct in6_flowlabel_req
{
struct in6_addr flr_dst;
__u32 flr_label;
__u8 flr_action;
__u8 flr_share;
__u16 flr_flags;
__u16 flr_expires;
__u16 flr_linger;
__u32 __flr_reserved;
/* Options in format of IPV6_PKTOPTIONS */
};
\end{verbatim}
\begin{itemize}
\item \verb|dst| is IPv6 destination address associated with the label.
\item \verb|label| is flow label value in network byte order. If it is zero,
kernel will allocate new pseudo-random number. Otherwise, kernel will try
to lease flow label ordered by user. In this case, it is user task to provide
necessary flow label randomness.
\item \verb|action| is requested operation. Currently, only three operations
are defined:
\begin{verbatim}
#define IPV6_FL_A_GET 0 /* Get flow label */
#define IPV6_FL_A_PUT 1 /* Release flow label */
#define IPV6_FL_A_RENEW 2 /* Update expire time */
\end{verbatim}
\item \verb|flags| are optional modifiers. Currently
only \verb|IPV6_FL_A_GET| has modifiers:
\begin{verbatim}
#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */
#define IPV6_FL_F_EXCL 2 /* Do not create new label */
\end{verbatim}
\item \verb|share| defines who is allowed to reuse the same flow label.
\begin{verbatim}
#define IPV6_FL_S_NONE 0 /* Not defined */
#define IPV6_FL_S_EXCL 1 /* Label is private */
#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */
#define IPV6_FL_S_USER 3 /* May be reused by this user */
#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */
\end{verbatim}
\item \verb|linger| is time in seconds. After the last user releases flow
label, it will not be reused with different destination and options at least
during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label
still can be shared by other sockets. Current implementation does not allow
unprivileged user to set linger longer than 60 sec.
\item \verb|expires| is time in seconds. Flow label will be kept at least
for this time, but it will not be destroyed before user released it explicitly
or closed all the sockets using it. Current implementation does not allow
unprivileged user to set timeout longer than 60 sec. Privileged applications
MAY set longer lifetimes, but in this case they MUST save allocated
labels at stable storage and restore them back after reboot before the first
application allocates new flow.
\end{itemize}
This structure is followed by optional extension headers associated
with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only
\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| is present,
\verb|IPV6_DSTOPTS| are allowed.
\paragraph{Example.}
\addcontentsline{toc}{subsection}{Example}
The function \verb|get_flow_label| allocates
private flow label.
\begin{verbatim}
/*
 * Lease a private flow label for destination dst on socket fd and
 * enable sending of flow information on that socket.
 *
 * fd   socket to bind the label to
 * dst  destination address; on success the leased label is OR-ed
 *      into dst->sin6_flowinfo
 * fl   requested label value in host byte order (converted with htonl)
 *
 * Returns 0 on success. On failure prints a message with perror and
 * returns -1, leaving dst untouched.
 */
int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl)
{
        int on = 1;
        struct in6_flowlabel_req freq;

        memset(&freq, 0, sizeof(freq));
        freq.flr_label = htonl(fl);
        freq.flr_action = IPV6_FL_A_GET;
        freq.flr_flags = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL;
        freq.flr_share = IPV6_FL_S_EXCL;
        /* Size taken from the destination field instead of a magic 16. */
        memcpy(&freq.flr_dst, &dst->sin6_addr, sizeof(freq.flr_dst));

        if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
                       &freq, sizeof(freq)) == -1) {
                perror ("can't lease flowlabel");
                return -1;
        }

        if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND,
                       &on, sizeof(on)) == -1) {
                perror ("can't send flowinfo");
                /* Give the label back; nothing useful can be done if
                 * the release itself fails. */
                freq.flr_action = IPV6_FL_A_PUT;
                setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR,
                           &freq, sizeof(freq));
                return -1;
        }

        /* Commit the label to the caller's address only once both
         * socket options are in place, so that a failed call does not
         * leave a released label behind in dst. */
        dst->sin6_flowinfo |= freq.flr_label;
        return 0;
}
\end{verbatim}
A bit more complicated example using routing header can be found
in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend
contains an example of using operation \verb|IPV6_FL_A_RENEW|.
\paragraph{Listing flow labels.}
\addcontentsline{toc}{subsection}{Listing flow labels}
List of currently allocated
flow labels may be read from \verb|/proc/net/ip6_flowlabel|.
\begin{verbatim}
Label S Owner Users Linger Expires Dst Opt
A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0
\end{verbatim}
\begin{itemize}
\item \verb|Label| is hexadecimal flow label value.
\item \verb|S| is sharing style.
\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on
sharing style.
\item \verb|Users| is number of applications using the label now.
\item \verb|Linger| is \verb|linger| of this label in seconds.
\item \verb|Expires| is time until expiration of the label in seconds. It may
be negative, if the label is in use.
\item \verb|Dst| is IPv6 destination address.
\item \verb|Opt| is length of options, associated with the label. Option
data are not accessible.
\end{itemize}
\paragraph{Flow labels and RSVP.}
\addcontentsline{toc}{subsection}{Flow labels and RSVP}
RSVP daemon supports IPv6 flow labels
without any modifications to standard ISI RAPI. Sender must allocate
flow label, fill corresponding sender template and submit it to local rsvp
daemon. rsvpd will check the label and start to announce it in PATH
messages. Rsvpd on sender node will renew the flow label, so that it will not
be reused before path state expires and all the intermediate
routers and receiver purge flow state.
\verb|rtap| utility is modified to parse flow labels. F.e.\ if user allocated
flow label \verb|0xA1234|, he may write:
\begin{verbatim}
RTAP> sender 3ffe:2400::1/FL0xA1234 <Tspec>
\end{verbatim}
Receiver makes reservation with command:
\begin{verbatim}
RTAP> reserve ff 3ffe:2400::1/FL0xA1234 <Flowspec>
\end{verbatim}
\end{document}

130
doc/arpd.sgml Normal file
View File

@ -0,0 +1,130 @@
<!doctype linuxdoc system>
<article>
<title>ARPD Daemon
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/arpd/ is daemon collecting gratuitous ARP information, saving
it on local disk and feeding it to kernel on demand to avoid
redundant broadcasting due to limited size of kernel ARP cache.
</abstract>
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists
of three columns: interface index, IP address and MAC address.
Negative entries for dead hosts are also shown, in this case MAC address
is replaced by word <tt/FAILED/ followed by colon and time when the fact
that host is dead was proven the last time.
<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/
in text format similar to that dumped by option <tt/-l/. Exit after load,
probably listing resulting database, if option <tt/-l/ is also given.
If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table.
<item><tt/-b DATABASE/ - location of database file. Default location is
<tt>/var/lib/arpd/arpd.db</tt>.
<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens to ARP on the wire, but
also sends broadcast queries itself. <tt/NUMBER/ is number of such queries
to make before destination is considered as dead. When <tt/arpd/ is started
as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/
or even with option <tt/-k/) without this option and still did not learn enough
information, you can observe 1 second gaps in service. Not fatal, but
not good.
<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes
sense together with option <tt/-a/.
<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/
suppresses further attempts to resolve for this period. It makes sense
only together with option <tt/-k/. This timeout should not be too much
longer than boot time of a typical host not supporting gratuitous ARP.
Default value is 60 seconds.
<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/
in packets per second. Default value is 1.
<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back.
Default value is 3. Together with option <tt/-R/ this option allows
to police broadcasting not to exceed <tt/B+R*T/ over any interval
of time <tt/T/.
</itemize>
<p><tt/INTERFACE/ is name of networking interface to watch.
If no interfaces given, <tt/arpd/ monitors all the interfaces.
In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters,
it is supposed user does this himself after <tt/arpd/ is started.
<p> Signals
<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted
<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/.
<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics
to <tt/syslog/. The effect of other signals is undefined, they may corrupt
database and leave <tt/sysctl/ parameters in an unpredictable state.
<p> Note
<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be
compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list
is not given on command line, variable <tt/app_solicit/
on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
If this is not made <tt/arpd/ still collects gratuitous ARP information
in its database.
<p> Examples
<enum>
<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing
with kernel functionality:
<tscreen><verb>
arpd -b /var/tmp/arpd.db
</verb></tscreen>
<item> Look at result after some time:
<tscreen><verb>
killall arpd
arpd -l -b /var/tmp/arpd.db
</verb></tscreen>
<item> To enable kernel helper, leaving leading role to kernel:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 1 eth0 eth1
</verb></tscreen>
<item> Completely replace kernel resolution on interfaces <tt/eth0/
and <tt/eth1/. In this case kernel still does unicast probing to
validate entries, but all the broadcast activity is suppressed
and made under authority of <tt/arpd/:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1
</verb></tscreen>
This is the mode in which <tt/arpd/ is supposed to work normally.
It is not the default just to prevent accidental enabling of a too
aggressive mode.
</enum>
</article>

16
doc/do-psnup Normal file
View File

@ -0,0 +1,16 @@
#! /bin/bash
#
# Run whichever PostScript n-up tool is installed (psnup, then psmulti)
# on $1, writing the result to $2. If neither tool is found, $1 is simply
# copied to $2. Each command is echoed before it is run.
#
# $1 = Temporary file . "string"
# $2 = File to process . "string"
# $3 = Page size . ie: a4 , letter ... "string"
# $4 = Number of pages to fit on a single sheet . "numeric"
#
# All parameter expansions are quoted so that file names containing
# spaces or glob characters are passed to the tools as single words.

if type psnup >/dev/null 2>&1; then
	echo "psnup -$4 -p$3 $1 $2"
	psnup "-$4" "-p$3" "$1" "$2"
elif type psmulti >/dev/null 2>&1; then
	echo "psmulti $1 > $2"
	psmulti "$1" > "$2"
else
	echo "cp $1 $2"
	cp "$1" "$2"
fi

3413
doc/ip-cref.tex Normal file

File diff suppressed because it is too large Load Diff

469
doc/ip-tunnels.tex Normal file
View File

@ -0,0 +1,469 @@
\documentstyle[12pt,twoside]{article}
\def\TITLE{Tunnels over IP}
\input preamble
\begin{center}
\Large\bf Tunnels over IP in Linux-2.2
\end{center}
\begin{center}
{ \large Alexey~N.~Kuznetsov } \\
\em Institute for Nuclear Research, Moscow \\
\verb|kuznet@ms2.inr.ac.ru| \\
\rm March 17, 1999
\end{center}
\vspace{5mm}
\tableofcontents
\section{Instead of introduction: micro-FAQ.}
\begin{itemize}
\item
Q: In linux-2.0.36 I used:
\begin{verbatim}
ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65
\end{verbatim}
to create tunnel. It does not work in 2.2.0!
A: You are right, it does not work. The command written above is split to two commands.
\begin{verbatim}
ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65
\end{verbatim}
will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure
it with:
\begin{verbatim}
ifconfig MY-TUNNEL 10.0.0.1
\end{verbatim}
Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|,
you still may use it.
\item
Q: In linux-2.0.36 I used:
\begin{verbatim}
ifconfig tunl0 10.0.0.1
route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0
\end{verbatim}
to tunnel net 10.0.0.0 via router 193.233.7.65. It does not
work in 2.2.0! Moreover, \verb|route| prints a funny error sort of
``network unreachable'' and after this I found a strange direct route
to 10.0.0.0 via \verb|tunl0| in routing table.
A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly
connected network has not any exceptions. You may tell kernel, that
this particular route is {\em abnormal}:
\begin{verbatim}
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink
\end{verbatim}
Note keyword \verb|onlink|, it is the magic key that orders kernel
not to check for consistency of gateway address.
Probably, after this explanation you have already guessed another method
to cheat kernel:
\begin{verbatim}
ifconfig tunl0 10.0.0.1 netmask 255.255.255.255
route add -host 193.233.7.65 dev tunl0
route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65
route del -host 193.233.7.65 dev tunl0
\end{verbatim}
Well, if you like such tricks, nobody may prohibit you to use them.
Only do not forget
that between \verb|route add| and \verb|route del| host 193.233.7.65 is
unreachable.
\item
Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module.
I cannot find any \verb|tunnel| in 2.2!
A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling
and for all IPIP tunnel devices.
\item
Q: \verb|traceroute| does not work over tunnel! Well, stop... It works,
only skips some number of hops.
A: Yes. By default tunnel driver copies \verb|ttl| value from
inner packet to outer one. It means that path traversed by tunneled
packets to another endpoint is not hidden. If you dislike this, or if you
are going to use some routing protocol expecting that packets
with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP)
and you are not afraid of
tunnel loops, you may append option \verb|ttl 64|, when creating tunnel
with \verb|ip tunnel add|.
\item
Q: ... Well, list of things, which 2.0 was able to do finishes.
\end{itemize}
\paragraph{Summary of differences between 2.2 and 2.0.}
\begin{itemize}
\item {\bf In 2.0} you could compile tunnel device into kernel
and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or,
alternatively, compile it as module and load new module
for each new tunnel. Also, module \verb|ipip| was necessary
to receive tunneled packets.
{\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base
tunnel device \verb|tunl0| and another tunnels may be created with command
\verb|ip tunnel add|. These new devices may have arbitrary names.
\item {\bf In 2.0} you set remote tunnel endpoint address with
the command \verb|ifconfig| ... \verb|pointopoint A|.
{\bf In 2.2} this command has the same semantics on all
the interfaces, namely it sets not tunnel endpoint,
but address of peering host, which is directly reachable
via this tunnel,
rather than via Internet. Actual tunnel endpoint address \verb|A|
should be set with \verb|ip tunnel add ... remote A|.
\item {\bf In 2.0} you create tunnel routes with the command:
\begin{verbatim}
route add -net 10.0.0.0 gw A dev tunl0
\end{verbatim}
{\bf 2.2} interprets this command equally for all device
kinds and gateway is required to be directly reachable via this tunnel,
rather than via Internet. You still may use \verb|ip route add ... onlink|
to override this behaviour.
\end{itemize}
\section{Tunnel setup: basics}
Standard Linux-2.2 kernel supports three flavors of tunnels,
listed in the following table:
\vspace{2mm}
\begin{tabular}{lll}
\vrule depth 0.8ex width 0pt\relax
Mode & Description & Base device \\
ipip & IP over IP & tunl0 \\
sit & IPv6 over IP & sit0 \\
gre & ANY over GRE over IP & gre0
\end{tabular}
\vspace{2mm}
\noindent All the kinds of tunnels are created with one command:
\begin{verbatim}
ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ]
\end{verbatim}
This command creates new tunnel device with name \verb|<NAME>|.
The \verb|<NAME>| is an arbitrary string. Particularly,
it may be even \verb|eth0|. The rest of parameters set
different tunnel characteristics.
\begin{itemize}
\item
\verb|mode <MODE>| sets tunnel mode. Three modes are available now
\verb|ipip|, \verb|sit| and \verb|gre|.
\item
\verb|remote <D>| sets remote endpoint of the tunnel to IP
address \verb|<D>|.
\item
\verb|local <S>| sets fixed local address for tunneled
packets. It must be an address on another interface of this host.
\end{itemize}
\let\thefootnote\oldthefootnote
Both \verb|remote| and \verb|local| may be omitted. In this case we
say that they are zero or wildcard. Two tunnels of one mode cannot
have the same \verb|remote| and \verb|local|. Particularly it means
that base device or fallback tunnel cannot be replicated.\footnote{
This restriction is relaxed for keyed GRE tunnels.}
Tunnels are divided to two classes: {\bf pointopoint} tunnels, which
have some not wildcard \verb|remote| address and deliver all the packets
to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels,
which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|)
are NBMA, because they have neither \verb|remote| nor
\verb|local| addresses.
After tunnel device is created you should configure it as you did
it with another devices. Certainly, the configuration of tunnels has
some features related to the fact that they work over existing Internet
routing infrastructure and simultaneously create new virtual links,
which changes this infrastructure. The danger that not enough careful
tunnel setup will result in formation of tunnel loops,
collapse of routing or flooding network with exponentially
growing number of tunneled fragments is very real.
Protocol setup on pointopoint tunnels does not differ from configuration
of other devices. You should set a protocol address with \verb|ifconfig|
and add routes with \verb|route| utility.
NBMA tunnels are different. To route something via NBMA tunnel
you have to explain to driver, where it should deliver packets to.
The only way to make it is to create special routes with gateway
address pointing to desired endpoint. F.e.\
\begin{verbatim}
ip route add 10.0.0.0/24 via <A> dev tunl0 onlink
\end{verbatim}
It is important to use option \verb|onlink|, otherwise
kernel will refuse request to create route via gateway not directly
reachable over device \verb|tunl0|. With IPv6 the situation is much simpler:
when you start device \verb|sit0|, it automatically configures itself
with all IPv4 addresses mapped to IPv6 space, so that all IPv4
Internet is {\em really reachable} via \verb|sit0|! Excellent, the command
\begin{verbatim}
ip route add 3FFE::/16 via ::193.233.7.65 dev sit0
\end{verbatim}
will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets
destined to this prefix to 193.233.7.65.
\section{Tunnel setup: options}
Command \verb|ip tunnel add| has several additional options.
\begin{itemize}
\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets.
\verb|N| is number in the range 1--255. 0 is special value,
meaning that packets inherit TTL value.
Default value is: \verb|inherit|.
\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets.
Default value is: \verb|inherit|.
\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that
tunneled packets will be routed only via this device and will
not be able to escape to another device, when route to endpoint changes.
\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel.
It is enabled by default. Note that fixed ttl is incompatible
with this option: tunnels with fixed ttl always make pmtu discovery.
\end{itemize}
\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre|
tunnels are more complicated:
\begin{itemize}
\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is
either number or IP address-like dotted quad.
\item \verb|csum| --- checksum tunneled packets.
\item \verb|seq| --- serialize packets.
\begin{NB}
I think this option does not
work. At least, I did not test it, did not debug it and
even do not understand, how it is supposed to work and for what
purpose Cisco planned to use it.
\end{NB}
\end{itemize}
Actually, these GRE options can be set separately for input and
output directions by prefixing corresponding keywords with letter
\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only
packets with correct checksum and \verb|ocsum| means, that
our host will calculate and send checksum.
Command \verb|ip tunnel add| is not the only operation,
which can be made with tunnels. Certainly, you may get short help page
with:
\begin{verbatim}
ip tunnel help
\end{verbatim}
Besides that, you may view list of installed tunnels with the help of command:
\begin{verbatim}
ip tunnel ls
\end{verbatim}
Also you may look at statistics:
\begin{verbatim}
ip -s tunnel ls Cisco
\end{verbatim}
where \verb|Cisco| is name of tunnel device. Command
\begin{verbatim}
ip tunnel del Cisco
\end{verbatim}
destroys tunnel \verb|Cisco|. And, finally,
\begin{verbatim}
ip tunnel change Cisco mode sit local ME remote HE ttl 32
\end{verbatim}
changes its parameters.
\section{Differences 2.2 and 2.0 tunnels revisited.}
Now we can discuss more subtle differences between tunneling in 2.0
and 2.2.
\begin{itemize}
\item In 2.0 all tunneled packets were received promiscuously
as soon as you loaded module \verb|ipip|. 2.2 tries to select the best
tunnel device and packet looks as received on this. F.e.\ if host
received \verb|ipip| packet from host \verb|D| destined to our
local address \verb|S|, kernel searches for matching tunnels
in order:
\begin{tabular}{ll}
1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\
2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\
3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\
4 & \verb|tunl0|
\end{tabular}
If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored.
Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets,
not acknowledged by more specific tunnels.
Be careful, it means that without carefully installed firewall rules
anyone on the Internet may inject to your network any packets with
source addresses indistinguishable from local ones. It is not so bad idea
to design tunnels in the way enforcing maximal route symmetry
and to enable reversed path filter (\verb|rp_filter| sysctl option) on
tunnel devices.
\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|.
F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets,
which kernel output, via tunnel \verb|Cisco| and the packets received on it
from kernel viewpoint.
\end{itemize}
\section{Linux and Cisco IOS tunnels.}
Among another tunnels Cisco IOS supports IPIP and GRE.
Essentially, Cisco setup is subset of options, available for Linux.
Let us consider the simplest example:
\begin{verbatim}
interface Tunnel0
tunnel mode gre ip
tunnel source 10.10.14.1
tunnel destination 10.10.13.2
\end{verbatim}
This command set translates to:
\begin{verbatim}
ip tunnel add Tunnel0 \
mode gre \
local 10.10.14.1 \
remote 10.10.13.2
\end{verbatim}
Any questions? No questions.
\section{Interaction IPIP tunnels and DVMRP.}
DVMRP exploits IPIP tunnels to route multicasts via Internet.
\verb|mrouted| creates
IPIP tunnels listed in its configuration file automatically.
From kernel and user viewpoints there are no differences between
tunnels, created in this way, and tunnels created by \verb|ip tunnel|.
I.e.\ if \verb|mrouted| created some tunnel, it may be used to
route unicast packets, provided appropriate routes are added.
And vice versa, if administrator has already created a tunnel,
it will be reused by \verb|mrouted|, if it requests DVMRP
tunnel with the same local and remote addresses.
Do not wonder, if your manually configured tunnel is
destroyed, when mrouted exits.
\section{Broadcast GRE ``tunnels''.}
It is possible to set \verb|remote| for GRE tunnel to a multicast
address. Such tunnel becomes {\bf broadcast} tunnel (though word
tunnel is not quite appropriate in this case, it is rather virtual network).
\begin{verbatim}
ip tunnel add Universe local 193.233.7.65 \
remote 224.66.66.66 ttl 16
ip addr add 10.0.0.1/16 dev Universe
ip link set Universe up
\end{verbatim}
This tunnel is true broadcast network and broadcast packets are
sent to multicast group 224.66.66.66. By default such tunnel starts
to resolve both IP and IPv6 addresses via ARP/NDISC, so that
if multicast routing is supported in surrounding network, all GRE nodes
will find one another automatically and will form virtual Ethernet-like
broadcast network. If multicast routing does not work, it is unpleasant
but not fatal flaw. The tunnel becomes NBMA rather than broadcast network.
You may disable dynamic ARPing by:
\begin{verbatim}
echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit
\end{verbatim}
and to add required information to ARP tables manually:
\begin{verbatim}
ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent
\end{verbatim}
In this case packets sent to 10.0.0.2 will be encapsulated in GRE
and sent to 128.6.190.2. It is possible to facilitate address resolution
using methods typical for another NBMA networks f.e.\ to start user
level \verb|arpd| daemon, which will maintain database of hosts attached
to GRE virtual network or ask a dedicated ARP or NHRP server
for information.
Actually, such setup is the most natural for tunneling,
it is really flexible, scalable and easily manageable, so that
it is strongly recommended to be used with GRE tunnels instead of ugly
hack with NBMA mode and \verb|onlink| modifier. Unfortunately,
by historical reasons broadcast mode is not supported by IPIP tunnels,
but this probably will change in future.
\section{Traffic control issues.}
Tunnels are devices, hence all the power of Linux traffic control
applies to them. The simplest (and the most useful in practice)
example is limiting tunnel bandwidth. The following command:
\begin{verbatim}
tc qdisc add dev tunl0 root tbf \
rate 128Kbit burst 4K limit 10K
\end{verbatim}
will limit tunneled traffic to 128Kbit with maximal burst size of 4K
and queuing not more than 10K.
However, you should remember, that tunnels are {\em virtual} devices
implemented in software and true queue management is impossible for them
just because they have no queues. Instead, it is better to create classes
on real physical interfaces and to map tunneled packets to them.
In general case of dynamic routing you should create such classes
on all outgoing interfaces, or, alternatively,
to use option \verb|dev DEV| to bind tunnel to a fixed physical device.
In the last case packets will be routed only via specified device
and you need to setup corresponding classes only on it.
Though you have to pay for this convenience,
if routing will change, your tunnel will fail.
Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0|
specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|.
Now you can select IPIP packets with addresses \verb|S| and \verb|D|
with some classifier and map them to class \verb|1:ABC|. F.e.\
it is easy to make with \verb|rsvp| classifier:
\begin{verbatim}
tc filter add dev eth0 pref 100 proto ip rsvp \
session D ipproto ipip filter S \
classid 1:ABC
\end{verbatim}
If you want to make more detailed classification of sub-flows
transmitted via tunnel, you can build CBQ subtree,
rooted at \verb|1:ABC| and attach to subroot set of rules parsing
IPIP packets more deeply.
\end{document}

110
doc/nstat.sgml Normal file
View File

@ -0,0 +1,110 @@
<!doctype linuxdoc system>
<article>
<title>NSTAT, IFSTAT and RTACCT Utilities
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping
to monitor kernel snmp counters and network interface statistics.
</abstract>
<p> These utilities are very similar, so that I describe
them simultaneously, using name <tt/Xstat/ in the places which apply
to all of them.
<p>The format of the command is:
<tscreen><verb>
Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ]
</verb></tscreen>
<p>
<tt/PATTERN/ is shell style pattern, selecting identifier
of SNMP variables or interfaces to show. Variable is displayed
if one of patterns matches its name. If no patterns are given,
<tt/Xstat/ assumes that user wants to see all the variables.
<p> <tt/OPTIONS/ is list of single letter options, using common unix
conventions.
<itemize>
<item><tt/-h/ - show help page
<item><tt/-?/ - the same, of course
<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit
<item><tt/-z/ - dump zero counters too. By default they are not shown.
<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/
calculates increments since the previous use.
<item><tt/-s/ - do not update history, so that the next time you will
see counters including values accumulated to the moment
of this measurement too.
<item><tt/-n/ - do not display anything, only update history.
<item><tt/-r/ - reset history.
<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting
statistics. <tt/INTERVAL/ is interval between measurements
in seconds.
<item><tt/-t INTERVAL/ - time interval to average rates. Default value
is 60 seconds.
<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only).
</itemize>
<p>
History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt>
or in file given by environment variables <tt/NSTAT_HISTORY/,
<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/.
Each time you use <tt/Xstat/, the values there are updated.
If you use patterns, only the values which you _really_ see
are updated. If you want to skip an uninteresting period,
use option <tt/-n/, or just output to <tt>/dev/null</tt>.
<p>
<tt/Xstat/ understands when history is invalidated by system reboot
or source of information switched between different instances
of daemonic <tt/Xstat/ and kernel SNMP tables and does not
use invalid history.
<p> Beware, <tt/Xstat/ will not produce sane output,
when many processes use it simultaneously. If several processes
under single user need this utility they should use environment
variables to put their history in safe places
or to use it with options <tt/-a -s/.
<p>
Well, that's all. The utility is very simple, but nevertheless
very handy.
<p> <bf/Output of XSTAT/
<p> The first line of output is <tt/#/ followed by identifier
of source of information, it may be word <tt/kernel/, when <tt/Xstat/
gets information from kernel or some dotted decimal number followed
by parameters, when it obtains information from running <tt/Xstat/ daemon.
<p>In the case of <tt/nstat/ the rest of output consists of three columns:
SNMP MIB identifier,
its value (or increment since previous measurement) and average
rate of increase of the counter per second. <tt/ifstat/ outputs
interface name followed by pairs of counter and rate of its change.
<p> <bf/Daemonic Xstat/
<p> <tt/Xstat/ may be started as daemon by any user. This makes sense
to avoid wrapped counters and to obtain reasonable long counters
for large time. Also <tt/Xstat/ daemon calculates average rates.
For the first goal sampling interval (option <tt/-d/) may be large enough,
f.e. for gigabit rates byte counters overflow not more frequently than
each 40 seconds and you may select interval of 20 seconds.
On the other hand, when <tt/Xstat/ is used for estimating rates
interval should be less than averaging period (option <tt/-t/), otherwise
estimation loses in quality.
Client <tt/Xstat/, before trying to get information from the kernel,
contacts daemon started by this user, then it tries system wide
daemon, which is supposed to be started by superuser. And only if
none of them replied it gets information from kernel.
<p> <bf/Environment/
<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/.
<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/.
<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/.
</article>

26
doc/preamble.tex Normal file
View File

@ -0,0 +1,26 @@
\textwidth 6.0in
\textheight 8.5in
\input SNAPSHOT
\pagestyle{myheadings}
\markboth{\protect\TITLE}{}
\markright{{\protect\sc iproute2-ss\Draft}}
% To print it in compact form: both sides on one sheet (psnup -2)
\evensidemargin=\oddsidemargin
\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB.
}{\par\egroup \vskip 1mm}
\def\threeonly{[2.3.15+ only] }
\begin{document}
\makeatletter
\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}}
\makeatother
\let\oldthefootnote\thefootnote
\def\thefootnote{}
\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov}

52
doc/rtstat.sgml Normal file
View File

@ -0,0 +1,52 @@
<!doctype linuxdoc system>
<article>
<title>RTSTAT Utility
<author>Robert Olsson
<date>some_negative_number, 20 Dec 2001
<p>
Here is some code for monitoring the route cache. For systems handling high
network load, servers, routers, firewalls etc the route cache and its garbage
collection is crucial. Linux has a solid implementation.
<p>
The kernel patch (not required since linux-2.4.7) adds statistics counters
from route cache process into
/proc/net/rt_cache_stat. A companion user mode program presents the statistics
in a vmstat or iostat manner. The ratio between cache hits and misses gives
the flow length.
<p>
Hopefully it can help understanding performance and DoS and other related
issues.
<p> A URL where newer versions of this utility can be (probably) found
is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
rtstat [ OPTIONS ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-h/, <tt/-help/ - show help page and version of the utility.
<item><tt/-i INTERVAL/ - interval between snapshots, default value is
2 seconds.
<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line,
1 prescribes to print it once and 2 (this is default setting) forces header
line each 20 lines.
</itemize>
</article>

525
doc/ss.sgml Normal file
View File

@ -0,0 +1,525 @@
<!doctype linuxdoc system>
<article>
<title>SS Utility: Quick Intro
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/ss/ is yet another utility to investigate sockets.
Functionally it is NOT better than <tt/netstat/ combined
with some perl/awk scripts and though it is surely faster
it is not enough to make it much better. :-)
So, stop reading this now and do not waste your time.
Well, certainly, it proposes some functionality, which current
netstat is still not able to do, but surely will soon.
</abstract>
<sect>Why?
<p> <tt>/proc</tt> interface is inadequate, unfortunately.
When the number of sockets is large enough, <tt/netstat/ or even
plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses.
In linux-2.4 the disease became worse: even if the number
of sockets is small, reading <tt>/proc/net/tcp/</tt> is slow enough.
This utility presents a new approach, which is supposed to scale
well. I am not going to describe technical details here and
will concentrate on description of the command.
The only important thing to say is that it is not so bad idea
to load module <tt/tcp_diag/, which can be found in directory
<tt/Modules/ of <tt/iproute2/. If you do not make this <tt/ss/
will work, but it falls back to <tt>/proc</tt> and becomes slow
like <tt/netstat/, well, a bit faster yet (see section "Some numbers").
<sect>Old news
<p>
In the simplest form <tt/ss/ is equivalent to netstat
with some small deviations.
<itemize>
<item><tt/ss -t -a/ dumps all TCP sockets
<item><tt/ss -u -a/ dumps all UDP sockets
<item><tt/ss -w -a/ dumps all RAW sockets
<item><tt/ss -x -a/ dumps all UNIX sockets
</itemize>
<p>
Option <tt/-o/ shows TCP timers state.
Option <tt/-e/ shows some extended information.
Etc. etc. etc. Seems, all the options of netstat related to sockets
are supported. Though not AX.25 and other bizarres. :-)
If someone wants, he can make support for decnet and ipx.
Some rudimentary support for them is already present in iproute2 libutils,
and I will be glad to see these new members.
<p>
However, standard functionality is a bit different:
<p>
The first: without option <tt/-a/ sockets in states
<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too.
It is more reasonable default, I think.
<p>
The second: format of UNIX sockets is different. It coincides
with tcp/udp. Though standard kernel still does not allow to
see write/read queues and peer address of connected UNIX sockets,
the patch doing this exists.
<p>
The third: default is to dump only TCP sockets, rather than all of the types.
<p>
The next: by default it does not resolve numeric host addresses (like <tt/ip/)!
Resolving is enabled with option <tt/-r/. Service names, usually stored
in local files, are resolved by default. Also, if service database
does not contain references to a port, <tt/ss/ queries system
<tt/rpcbind/. RPC services are prefixed with <tt/rpc./
Resolution of services may be suppressed with option <tt/-n/.
<p>
It does not accept "long" options (I dislike them, sorry).
So, address family is given with family identifier following
option <tt/-f/ to be aligned with iproute2 conventions.
Mostly, it is to allow option parser to parse
addresses correctly, but as side effect it really limits dumping
to sockets supporting only given family. Option <tt/-A/ followed
by list of socket tables to dump is also supported.
Logically, id of socket table is different of _address_ family, which is
another point of incompatibility. So, id is one of
<tt/all/, <tt/tcp/, <tt/udp/,
<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See?
Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/
and it is not difficult to guess that <tt/packet/ allows
to look at packet sockets. Actually, there are also some other abbreviations,
f.e. <tt/unix_dgram/ selects only datagram UNIX sockets.
<p>
The next: well, I still do not know. :-)
<sect>Time to talk about new functionality.
<p>It is builtin filtering of socket lists.
<sect1> Filtering by state.
<p>
<tt/ss/ allows to filter socket states, using keywords
<tt/state/ and <tt/exclude/, followed by some state
identifier.
<p>
State identifier are standard TCP state names (not listed,
they are useless for you if you already do not know them)
or abbreviations:
<itemize>
<item><tt/all/ - for all the states
<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/)
<item><tt/big/ - all except for minisockets
<item><tt/connected/ - not closed and not listening
<item><tt/synchronized/ - connected and not <tt/SYN-SENT/
</itemize>
<p>
F.e. to dump all tcp sockets except <tt/SYN-RECV/:
<tscreen><verb>
ss exclude SYN-RECV
</verb></tscreen>
<p>
If neither <tt/state/ nor <tt/exclude/ directives
are present,
state filter defaults to <tt/all/ with option <tt/-a/
or to <tt/all/,
excluding listening, syn-recv, time-wait and closed sockets.
<sect1> Filtering by addresses and ports.
<p>
Option list may contain address/port filter.
It is boolean expression which consists of boolean operation
<tt/or/, <tt/and/, <tt/not/ and predicates.
Actually, all the flavors of names for boolean operations are eaten:
<tt/&amp/, <tt/&amp&amp/, <tt/|/, <tt/||/, <tt/!/, but do not forget
about special sense given to these symbols by unix shells and escape
them correctly, when used from command line.
<p>
Predicates may be of the following kinds:
<itemize>
<item>A. Address/port match, where address is checked against mask
and port is either wildcard or exact. It is one of:
<tscreen><verb>
dst prefix:port
src prefix:port
src unix:STRING
src link:protocol:ifindex
src nl:channel:pid
</verb></tscreen>
Both prefix and port may be absent or replaced with <tt/*/,
which means wildcard. UNIX socket use more powerful scheme
matching to socket names by shell wildcards. Also, prefixes
unix: and link: may be omitted, if address family is evident
from context (with option <tt/-x/ or with <tt/-f unix/
or with <tt/unix/ keyword)
<p>
F.e.
<tscreen><verb>
dst 10.0.0.1
dst 10.0.0.1:
dst 10.0.0.1/32:
dst 10.0.0.1:*
</verb></tscreen>
are equivalent and mean socket connected to
any port on host 10.0.0.1
<tscreen><verb>
dst 10.0.0.0/24:22
</verb></tscreen>
sockets connected to port 22 on network
10.0.0.0...255.
<p>
Note that the port is separated from the address by a colon, which creates
trouble with IPv6 addresses. Generally, we interpret the last
colon as the one splitting off the port. To allow IPv6 addresses to be given,
a trick like the one used in IPv6 HTTP URLs may be used:
<tscreen><verb>
dst [::1]
</verb></tscreen>
are sockets connected to ::1 on any port
<p>
Another way is <tt>dst ::1/128</tt>. The <tt>/</tt> helps to understand that
the colon is part of an IPv6 address.
<p>
Now we can add another alias for <tt/dst 10.0.0.1/:
<tt/dst [10.0.0.1]/. :-)
<p> Address may be a DNS name. In this case all the addresses are looked
up (in all the address families, if it is not limited by option <tt/-f/
or special address prefix <tt/inet:/, <tt/inet6/) and resulting
expression is <tt/or/ over all of them.
<item> B. Port expressions:
<tscreen><verb>
dport &gt= :1024
dport != :22
sport &lt :32000
</verb></tscreen>
etc.
All the relations: <tt/&lt/, <tt/&gt/, <tt/&lt=/, <tt/&gt=/, <tt/=/, <tt/==/,
<tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/...
Use variant which you like more, but not forget to escape special
characters when typing them in command line. :-)
Note that port number syntactically coincides to the case A!
You may even add an IP address, but it will not participate
in comparison, except for <tt/==/ and <tt/!=/, which are equivalent
to corresponding predicates of type A. F.e.
<p>
<tt/dst 10.0.0.1:22/
is equivalent to <tt/dport eq 10.0.0.1:22/
and
<tt/not dst 10.0.0.1:22/ is equivalent to
<tt/dport neq 10.0.0.1:22/
<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically
on local system.
</itemize>
<sect> Examples
<p>
<itemize>
<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache
to network 193.233.7/24 and look at their timers:
<tscreen><verb>
ss -o state fin-wait-1 \( sport = :http or sport = :https \) \
dst 193.233.7/24
</verb></tscreen>
Oops, forgot to say that missing logical operation is
equivalent to <tt/and/.
<item> 2. Well, now look at the rest...
<tscreen><verb>
ss -o excl fin-wait-1
ss state fin-wait-1 \( sport neq :http and sport neq :https \) \
or not dst 193.233.7/24
</verb></tscreen>
Note that we have to do _two_ calls of ss to do this.
State match is always anded to address/port match.
The reason for this is purely technical: ss does fast skip of
not matching states before parsing addresses and I consider the
ability to skip fastly gobs of time-wait and syn-recv sockets
as more important than logical generality.
<item> 3. So, let's look at all our sockets using autobound ports:
<tscreen><verb>
ss -a -A all autobound
</verb></tscreen>
<item> 4. And eventually find all the local processes connected
to local X servers:
<tscreen><verb>
ss -xp dst "/tmp/.X11-unix/*"
</verb></tscreen>
Pardon, this does not work with current kernel, patching is required.
But we still can look at server side:
<tscreen><verb>
ss -x src "/tmp/.X11-unix/*"
</verb></tscreen>
</itemize>
<sect> Returning to ground: real manual
<p>
<sect1> Command arguments
<p> General format of arguments to <tt/ss/ is:
<tscreen><verb>
ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ]
</verb></tscreen>
<sect2><tt/OPTIONS/
<p> <tt/OPTIONS/ is list of single letter options, using common unix
conventions.
<itemize>
<item><tt/-h/ - show help page
<item><tt/-?/ - the same, of course
<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit
<item><tt/-s/ - print summary statistics. This option does not parse
socket lists obtaining summary from various sources. It is useful
when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt>
is painful.
<item><tt/-D FILE/ - do not display anything, just dump raw information
about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/
<tt/stdout/ is used.
<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/.
Each line of <tt/FILE/ is interpreted like single command line option.
If <tt/FILE/ is <tt/-/ <tt/stdin/ is used.
<item><tt/-r/ - try to resolve numeric address/ports
<item><tt/-n/ - do not try to resolve ports
<item><tt/-o/ - show some optional information, f.e. TCP timers
<item><tt/-i/ - show some information specific to TCP (RTO, congestion
window, slow start threshold etc.)
<item><tt/-e/ - show even more optional information
<item><tt/-m/ - show extended information on memory used by the socket.
It is available only with <tt/tcp_diag/ enabled.
<item><tt/-p/ - show list of processes owning the socket
<item><tt/-f FAMILY/ - default address family used for parsing addresses.
Also this option limits listing to sockets supporting
given address family. Currently the following families
are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/,
<tt/netlink/.
<item><tt/-4/ - alias for <tt/-f inet/
<item><tt/-6/ - alias for <tt/-f inet6/
<item><tt/-0/ - alias for <tt/-f link/
<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated
by commas. The following identifiers are understood:
<tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/,
<tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/,
<tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/.
<item><tt/-x/ - alias for <tt/-A unix/
<item><tt/-t/ - alias for <tt/-A tcp/
<item><tt/-u/ - alias for <tt/-A udp/
<item><tt/-w/ - alias for <tt/-A raw/
<item><tt/-a/ - show sockets of all the states. By default sockets
in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN-RECV/
and <tt/CLOSE/ are skipped.
<item><tt/-l/ - show only sockets in state <tt/LISTEN/
</itemize>
<sect2><tt/STATE-FILTER/
<p><tt/STATE-FILTER/ allows to construct arbitrary set of
states to match. Its syntax is sequence of keywords <tt/state/
and <tt/exclude/ followed by identifier of state.
Available identifiers are:
<p>
<itemize>
<item> All standard TCP states: <tt/established/, <tt/syn-sent/,
<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/,
<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/.
<item><tt/all/ - for all the states
<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/
<item><tt/synchronized/ - all the <tt/connected/ states except for
<tt/syn-sent/
<item><tt/bucket/ - states, which are maintained as minisockets, i.e.
<tt/time-wait/ and <tt/syn-recv/.
<item><tt/big/ - opposite to <tt/bucket/
</itemize>
<sect2><tt/ADDRESS-FILTER/
<p><tt/ADDRESS-FILTER/ is boolean expression with operations <tt/and/, <tt/or/
and <tt/not/, which can be abbreviated in C style f.e. as <tt/&amp/,
<tt/&amp&amp/.
<p>
Predicates check socket addresses, both local and remote.
There are the following kinds of predicates:
<itemize>
<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port
<item> <tt/src ADDRESS_PATTERN/ - matches local address and port
<item> <tt/dport RELOP PORT/ - compares remote port to a number
<item> <tt/sport RELOP PORT/ - compares local port to a number
<item> <tt/autobound/ - checks that socket is bound to an ephemeral
port
</itemize>
<p><tt/RELOP/ is some of <tt/&lt=/, <tt/&gt=/, <tt/==/ etc.
To make this more convenient for use in unix shell, alphabetic
FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well.
<p>The format and semantics of <tt/ADDRESS_PATTERN/ depends on address
family.
<itemize>
<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally
followed by colon and port. If prefix or port part is absent or replaced
with <tt/*/, this means wildcard match.
<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6
address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows
to use scheme, like used in URLs, where address is surrounded with
<tt/[/ ... <tt/]/.
<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard.
<item><tt/packet/ - format looks like <tt/inet/, only interface index
stays instead of port and link layer protocol id instead of address.
<item><tt/netlink/ - format looks like <tt/inet/, only socket pid
stays instead of port and netlink channel instead of address.
</itemize>
<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard
address part. Certainly, it is undefined for UNIX sockets.
<sect1> Environment variables
<p>
<tt/ss/ allows to change source of information using various
environment variables:
<p>
<itemize>
<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt>
<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt>
<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt>
<item> etc.
</itemize>
<p>
Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt>
hierarchy.
<p>
Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of
requesting kernel to dump information about TCP sockets.
<p> This option is used mainly to investigate bug reports,
when dumps of files usually found in <tt>/proc/</tt> are received
by e-mail.
<sect1> Output format
<p>Six columns. The first is <tt/Netid/, it denotes socket type and
transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/,
<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX
datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for
raw and datagram packet sockets. This column is optional, it will
be hidden, if filter selects an unique netid.
<p>
The second column is <tt/State/. Socket state is displayed here.
The names are standard TCP names, except for <tt/UNCONN/, which
cannot happen for TCP, but normal for not connected sockets
of another types. Again, this column can be hidden.
<p>
Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data
queued for receive and transmit.
<p>
And the last two columns display local address and port of the socket
and its peer address, if the socket is connected.
<p>
If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are
displayed not in fixed positions but separated by spaces pairs:
<tt/option:value/. If value is not a single number, it is presented
as list of values, enclosed to <tt/(/ ... <tt/)/ and separated with
commas. F.e.
<tscreen><verb>
timer:(keepalive,111min,0)
</verb></tscreen>
is typical format for TCP timer (option <tt/-o/).
<tscreen><verb>
users:((X,113,3))
</verb></tscreen>
is typical for list of users (option <tt/-p/).
<sect>Some numbers
<p>
Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure
its performance. It is 30 requests per second here. Nothing to test,
it is too slow. OK, let us patch pidentd with patch from directory
Patches. After this it handles about 4300 requests per second
and becomes handy tool to pollute socket tables with lots of timewait
buckets.
<p>
So, each test starts from pollution tables with 30000 sockets
and then doing full dump of the table piped to wc and measuring
timings with time:
<p>Results:
<itemize>
<item> <tt/netstat -at/ - 15.6 seconds
<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds
<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds
</itemize>
No comments. Though one comment is necessary, most of time
without <tt/tcp_diag/ is wasted inside kernel with completely
blocked networking. More than 10 seconds, yes. <tt/tcp_diag/
does the same work for 100 milliseconds of system time.
</article>

View File

@ -1,6 +0,0 @@
#
# subpath mappings from mount point for pinning
#
#3 tracing
#4 foo/bar
#5 tc/cls1

View File

@ -5,4 +5,3 @@
4 meta
7 canid
8 ipset
9 ipt

View File

@ -12,7 +12,7 @@
9 audit
10 fiblookup
11 connector
12 nft
12 nft
13 ip6fw
14 dec-rt
15 uevent
@ -20,4 +20,4 @@
18 scsi-trans
19 ecryptfs
20 rdma
21 crypto
21 crypto

View File

@ -14,12 +14,18 @@
13 dnrouted
14 xorp
15 ntk
16 dhcp
18 keepalived
16 dhcp
42 babel
99 openr
186 bgp
187 isis
188 ospf
189 rip
192 eigrp
#
# Used by me for gated
#
254 gated/aggr
253 gated/bgp
252 gated/ospf
251 gated/ospfase
250 gated/rip
249 gated/static
248 gated/conn
247 gated/inet
246 gated/default

View File

@ -1,2 +0,0 @@
Each file in this directory is an rt_protos configuration file. iproute2
commands scan this directory processing all files that end in '.conf'.

View File

@ -1,2 +0,0 @@
Each file in this directory is an rt_tables configuration file. iproute2
commands scan this directory processing all files that end in '.conf'.

122
examples/README.cbq Normal file
View File

@ -0,0 +1,122 @@
# CHANGES
# -------
# v0.3a2- fixed bug in "if" operator. Thanks kad@dgtu.donetsk.ua.
# v0.3a- added TIME parameter. Example:
# TIME=00:00-19:00;64Kbit/6Kbit
# So, between 00:00 and 19:00 RATE will be 64Kbit.
# Just start "cbq.init timecheck" periodically from cron (every 10
# minutes for example).
# !!! Anyway you MUST start "cbq.init start" for CBQ initialize.
# v0.2 - Some cosmetic changes. Now it is more compatible with
# old bash version. Thanks to Stanislav V. Voronyi
# <stas@cnti.uanet.kharkov.ua>.
# v0.1 - First public release
#
# README
# ------
#
# First of all - this is just a SIMPLE EXAMPLE of CBQ power.
# Don't ask me "why" and "how" :)
#
# This is an example of using CBQ (Class Based Queueing) and policy-based
# filter for building smart ethernet shapers. All CBQ parameters are
# correct only for ETHERNET (eth0,1,2..) linux interfaces. It works for
# ARCNET too (just set bandwidth parameter to 2Mbit). It was tested
# on 2.1.125-2.1.129 linux kernels (KSI linux, Nostromo version) and
# ip-route utility by A.Kuznetsov (iproute2-ss981101 version).
# You can download ip-route from ftp://ftp.inr.ac.ru/ip-routing or
# get iproute2*.rpm (compiled with glibc) from ftp.ksi-linux.com.
#
#
# HOW IT WORKS
#
# Each shaper must be described by config file in $CBQ_PATH
# (/etc/sysconfig/cbq/) directory - one config file for each CBQ shaper.
#
# Some words about config file name:
# Each shaper has its personal ID - two byte HEX number. Really ID is
# CBQ class.
# So, filename looks like:
#
# cbq-1280.My_first_shaper
# ^^^ ^^^ ^^^^^^^^^^^^^
# | | |______ Shaper name - any word
# | |___________________ ID (0000-FFFF), let ID looks like shaper's rate
# |______________________ Filename must begin from "cbq-"
#
#
# Config file describes shaper parameters and source[destination]
# address[port].
# For example let's prepare /etc/sysconfig/cbq/cbq-1280.My_first_shaper:
#
# ----------8<---------------------
# DEVICE=eth0,10Mbit,1Mbit
# RATE=128Kbit
# WEIGHT=10Kbit
# PRIO=5
# RULE=192.168.1.0/24
# ----------8<---------------------
#
# This is minimal configuration, where:
# DEVICE: eth0 - device where we do control our traffic
# 10Mbit - REAL ethernet card bandwidth
# 1Mbit - "weight" of :1 class (parent for all shapers for eth0),
# as a rule of thumb weight=bandwidth/10.
# 100Mbit adapter's example: DEVICE=eth0,100Mbit,10Mbit
# *** If you want to build more than one shaper per device it's
# enough to describe bandwidth and weight once - cbq.init
# is smart :) You can put only 'DEVICE=eth0' into cbq-*
# config file for eth0.
#
# RATE: Shaper's speed - Kbit,Mbit or bps (bytes per second)
#
# WEIGHT: "weight" of shaper (CBQ class). Like for DEVICE - approx. RATE/10
#
# PRIO: shaper's priority from 1 to 8 where 1 is the highest one.
# I do always use "5" for all my shapers.
#
# RULE: [source addr][:source port],[dest addr][:dest port]
# Some examples:
# RULE=10.1.1.0/24:80 - all traffic for network 10.1.1.0 to port 80
# will be shaped.
# RULE=10.2.2.5 - shaper works only for IP address 10.2.2.5
# RULE=:25,10.2.2.128/25:5000 - all traffic from any address and port 25 to
# address 10.2.2.128 - 10.2.2.255 and port 5000
# will be shaped.
# RULE=10.5.5.5:80, - shaper active only for traffic from port 80 of
# address 10.5.5.5
# Multiple RULE fields per one config file are allowed. For example:
# RULE=10.1.1.2:80
# RULE=10.1.1.2:25
# RULE=10.1.1.2:110
#
# *** ATTENTION!!!
# All shapers do work only for outgoing traffic!
# So, if you want to build bidirectional shaper you must set it up for
# both ethernet cards. For example let's build shaper for our linux box like:
#
# --------- 192.168.1.1
# BACKBONE -----eth0-| linux |-eth1------*[our client]
# ---------
#
# Let all traffic from backbone to client will be shaped at 28Kbit and
# traffic from client to backbone - at 128Kbit. We need two config files:
#
# ---8<-----/etc/sysconfig/cbq/cbq-28.client-out----
# DEVICE=eth1,10Mbit,1Mbit
# RATE=28Kbit
# WEIGHT=2Kbit
# PRIO=5
# RULE=192.168.1.1
# ---8<---------------------------------------------
#
# ---8<-----/etc/sysconfig/cbq/cbq-128.client-in----
# DEVICE=eth0,10Mbit,1Mbit
# RATE=128Kbit
# WEIGHT=10Kbit
# PRIO=5
# RULE=192.168.1.1,
# ---8<---------------------------------------------
# ^pay attention to "," - this is source address!
#
# Enjoy.

View File

@ -0,0 +1,49 @@
#! /bin/sh -x
#
# Sample script on using the ingress capabilities.
# This script shows how one can rate limit incoming SYNs.
# Useful for TCP-SYN attack protection. You can use
# ipchains to add more powerful matches on top of the SYN flag
# (e.g. additionally matching on the subnet).
#
# Paths to the various utilities;
# change them to reflect your setup.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
#
# tag all incoming SYN packets through $INDEV as mark value 1
############################################################
$IPCHAINS -A input -i $INDEV -y -m 1
############################################################
#
# install the ingress qdisc on the ingress interface
############################################################
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
#
# SYN packets are 40 bytes (320 bits) so three SYNs equals
# 960 bits (approximately 1kbit); so we rate limit below
# the incoming SYNs to 3/sec (not very useful really; but
# serves to show the point - JHS)
############################################################
# The fw classifier matches the mark value 1 set by ipchains above
# (handle 1); packets exceeding the policed rate are dropped.
$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 1 fw \
police rate 1kbit burst 40 mtu 9k drop flowid :1
############################################################
#
# Show what has been installed.
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
# Deleting the ingress qdisc (left disabled on purpose):
#$TC qdisc del dev $INDEV ingress

View File

@ -1,18 +0,0 @@
eBPF toy code examples (running in kernel) to familiarize yourself
with syntax and features:
- BTF defined map examples
- bpf_graft.c -> Demo on altering runtime behaviour
- bpf_shared.c -> Ingress/egress map sharing example
- bpf_map_in_map.c -> Using map in map example
- legacy struct bpf_elf_map defined map examples
- legacy/bpf_shared.c -> Ingress/egress map sharing example
- legacy/bpf_tailcall.c -> Using tail call chains
- legacy/bpf_cyclic.c -> Simple cycle as tail calls
- legacy/bpf_graft.c -> Demo on altering runtime behaviour
- legacy/bpf_map_in_map.c -> Using map in map example
Note: Users should use the new BTF way to define the maps; the examples
in the legacy folder, which use struct bpf_elf_map defined maps, are not
recommended.

258
examples/bpf/bpf_agent.c Normal file
View File

@ -0,0 +1,258 @@
/*
* eBPF user space agent part
*
* Simple, _self-contained_ user space agent for the eBPF kernel
* ebpf_prog.c program, which gets all map fds passed from tc via unix
* domain socket in one transaction and can thus keep referencing
* them from user space in order to read out (or possibly modify)
* map data. Here, just as a minimal example to display counters.
*
* The agent only uses the bpf(2) syscall API to read or possibly
* write to eBPF maps, it doesn't need to be aware of the low-level
* bytecode parts and/or ELF parsing bits.
*
* ! For more details, see header comment in bpf_prog.c !
*
* gcc bpf_agent.c -o bpf_agent -Wall -O2
*
* For example, a more complex user space agent could run on each
* host, reading and writing into eBPF maps used by tc classifier
* and actions. It would thus allow for implementing a distributed
* tc architecture, for example, which would push down central
* policies into eBPF maps, and thus altering run-time behaviour.
*
* -- Happy eBPF hacking! ;)
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>
#include <sys/un.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
/* Just some misc macros as min(), offsetof(), etc. */
#include "../../include/utils.h"
/* Common code from fd passing. */
#include "../../include/bpf_scm.h"
/* Common, shared definitions with ebpf_prog.c */
#include "bpf_shared.h"
/* Mini syscall wrapper */
#include "bpf_sys.h"
/* Print one drop counter per online CPU from the map referenced by fd.
 *
 * The map is indexed by CPU number (0 .. number of online CPUs - 1) and
 * every value is read as a long. A failed lookup is treated as a
 * programming error and trips the assertion.
 */
static void bpf_dump_drops(int fd)
{
	int cpu, max;

	max = sysconf(_SC_NPROCESSORS_ONLN);

	printf(" `- number of drops:");
	for (cpu = 0; cpu < max; cpu++) {
		long drops = 0;
		int ret;

		/* The lookup must not live inside assert(): when built with
		 * NDEBUG the whole expression is compiled out, the map would
		 * never be read and an uninitialised value would be printed.
		 */
		ret = bpf_lookup_elem(fd, &cpu, &drops);
		assert(ret == 0);
		(void)ret; /* silence "unused" warning under NDEBUG */

		printf("\tcpu%d: %5ld", cpu, drops);
	}
	printf("\n");
}
/* Print packet and mismatch counters for the first few NIC queues, read
 * from the map referenced by fd. A queue without a map entry (ENOENT)
 * is reported with zeroed counters.
 */
static void bpf_dump_queue(int fd)
{
	/* The queue count is fixed, just for the sake of the example. */
	const int max_queue = 4;
	int queue = 0;

	printf(" | nic queues:");
	while (queue < max_queue) {
		struct count_queue cq;
		int ret;

		/* Start from zero so a missing entry prints as 0/0. */
		memset(&cq, 0, sizeof(cq));

		ret = bpf_lookup_elem(fd, &queue, &cq);
		assert(ret == 0 || (ret < 0 && errno == ENOENT));

		printf("\tq%d:[pkts: %ld, mis: %ld]",
		       queue, cq.total, cq.mismatch);
		queue++;
	}
	printf("\n");
}
/* Print packet and byte counters for TCP, UDP and ICMP, read from the
 * map referenced by fd and keyed by the one-byte IP protocol number.
 * A protocol without a map entry (ENOENT) is reported with zeroed
 * counters.
 */
static void bpf_dump_proto(int fd)
{
	uint8_t protos[] = { IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP };
	char *names[] = { "tcp", "udp", "icmp" };
	const int count = ARRAY_SIZE(protos);
	int idx = 0;

	printf(" ` protos:");
	while (idx < count) {
		struct count_tuple ct;
		int ret;

		/* Start from zero so a missing entry prints as 0/0. */
		memset(&ct, 0, sizeof(ct));

		ret = bpf_lookup_elem(fd, &protos[idx], &ct);
		assert(ret == 0 || (ret < 0 && errno == ENOENT));

		printf("\t%s:[pkts: %ld, bytes: %ld]",
		       names[idx], ct.packets, ct.bytes);
		idx++;
	}
	printf("\n");
}
/* Dump the drops, queue and protocol maps 30 times, sleeping 5 seconds
 * after each round. tfd is indexed by map id (BPF_MAP_ID_*).
 */
static void bpf_dump_map_data(int *tfd)
{
	const int period = 5;
	int rounds_left = 30;

	while (rounds_left-- > 0) {
		printf("data, period: %dsec\n", period);

		bpf_dump_drops(tfd[BPF_MAP_ID_DROPS]);
		bpf_dump_queue(tfd[BPF_MAP_ID_QUEUE]);
		bpf_dump_proto(tfd[BPF_MAP_ID_PROTO]);

		sleep(period);
	}
}
/* Print the meta data received along with the map fds, build the table
 * that maps each map id to its fd, and start the periodic dump.
 *
 * fds: received file descriptors; fds[i] belongs to aux->ent[i].
 * aux: auxiliary data (object info and per-map attributes) sent by the
 *      peer.
 */
static void bpf_info_loop(int *fds, struct bpf_map_aux *aux)
{
	int i, tfd[BPF_MAP_ID_MAX];

	/* Mark every slot as "not received", so that a map missing from the
	 * set is looked up with an invalid fd rather than stack garbage.
	 */
	for (i = 0; i < BPF_MAP_ID_MAX; i++)
		tfd[i] = -1;

	printf("ver: %d\nobj: %s\ndev: %lu\nino: %lu\nmaps: %u\n",
	       aux->uds_ver, aux->obj_name, aux->obj_st.st_dev,
	       aux->obj_st.st_ino, aux->num_ent);

	for (i = 0; i < aux->num_ent; i++) {
		unsigned int id = aux->ent[i].id;

		printf("map%d:\n", i);
		printf(" `- fd: %u\n", fds[i]);
		printf(" | serial: %u\n", aux->ent[i].id);
		printf(" | type: %u\n", aux->ent[i].type);
		printf(" | max elem: %u\n", aux->ent[i].max_elem);
		printf(" | size key: %u\n", aux->ent[i].size_key);
		printf(" ` size val: %u\n", aux->ent[i].size_value);

		/* The id was received over the socket; never use it as an
		 * index without checking it against the table size.
		 */
		if (id >= BPF_MAP_ID_MAX) {
			fprintf(stderr, "map%d: id %u out of range, skipped\n",
				i, id);
			continue;
		}

		tfd[id] = fds[i];
	}

	bpf_dump_map_data(tfd);
}
/* Fill tfd[0 .. BPF_MAP_ID_MAX - 1] with map fds taken from the
 * environment variables BPF_MAP0, BPF_MAP1, ... Every variable must be
 * set; a missing one trips the assertion.
 */
static void bpf_map_get_from_env(int *tfd)
{
	int id = 0;

	while (id < BPF_MAP_ID_MAX) {
		char key[64];
		char *val;

		memset(key, 0, sizeof(key));
		snprintf(key, sizeof(key), "BPF_MAP%d", id);

		val = secure_getenv(key);
		assert(val != NULL);

		tfd[id] = atoi(val);
		id++;
	}
}
/* Receive map file descriptors and their auxiliary data from the socket fd.
 *
 * Messages are read with recvmsg() until 'entries' descriptors have been
 * collected or the number announced in aux->num_ent has been reached.
 *
 * fd:      socket to receive from
 * fds:     output array for the received descriptors
 * aux:     output for the auxiliary header and the per-map entries
 * entries: upper bound on the number of descriptors to collect
 *
 * Returns 0 on success; the negative recvmsg() result, or -1 when recvmsg()
 * returned 0; -EINVAL if the first control message is missing or is not
 * SCM_RIGHTS; -EIO if the control data was truncated; -1 if the derived
 * descriptor count is out of range.
 */
static int bpf_map_set_recv(int fd, int *fds, struct bpf_map_aux *aux,
unsigned int entries)
{
struct bpf_map_set_msg msg;
int *cmsg_buf, min_fd, i;
char *amsg_buf, *mmsg_buf;
/* NOTE(review): cmsg_buf presumably points at the descriptor payload of
 * the control message inside msg -- confirm in bpf_scm.h.
 */
cmsg_buf = bpf_map_set_init(&msg, NULL, 0);
amsg_buf = (char *)msg.aux.ent;
mmsg_buf = (char *)&msg.aux;
for (i = 0; i < entries; i += min_fd) {
struct cmsghdr *cmsg;
int ret;
/* Ask for at most one batch, and never for more than is still missing. */
min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i);
bpf_map_set_init_single(&msg, min_fd);
ret = recvmsg(fd, &msg.hdr, 0);
if (ret <= 0)
/* GNU "?:" extension: yields ret if it is non-zero, otherwise -1. */
return ret ? : -1;
cmsg = CMSG_FIRSTHDR(&msg.hdr);
if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
return -EINVAL;
if (msg.hdr.msg_flags & MSG_CTRUNC)
return -EIO;
/* Number of descriptors actually delivered, derived from the control
 * message length.
 */
min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd);
/* NOTE(review): this compares against 'entries', not 'entries - i'; the
 * copies below stay in bounds on later iterations only if
 * bpf_map_set_init_single() limits the control buffer to the requested
 * count -- confirm.
 */
if (min_fd > entries || min_fd <= 0)
return -1;
memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd);
memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd);
/* Copy the part of the aux data that precedes the ent[] array. */
memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent));
/* Stop once every entry announced by the sender has arrived. */
if (i + min_fd == aux->num_ent)
break;
}
return 0;
}
/* Entry point of the example agent.
 *
 * Without arguments the map fds are taken from the environment
 * (BPF_MAP<n>); otherwise the last argument is the path of a Unix domain
 * socket to bind to, on which the fds and their meta data are received.
 *
 * Returns 0 on normal termination; exits with status 1 if the socket
 * cannot be set up.
 */
int main(int argc, char **argv)
{
	int fds[BPF_SCM_MAX_FDS];
	struct bpf_map_aux aux;
	struct sockaddr_un addr;
	const char *path;
	int fd, ret, i;

	/* When arguments are being passed, we take it as a path
	 * to a Unix domain socket, otherwise we grab the fds
	 * from the environment to demonstrate both possibilities.
	 */
	if (argc == 1) {
		int tfd[BPF_MAP_ID_MAX];

		bpf_map_get_from_env(tfd);
		bpf_dump_map_data(tfd);

		return 0;
	}

	/* Refuse paths that do not fit including the terminating NUL;
	 * silently truncating would bind to a different name than asked.
	 */
	path = argv[argc - 1];
	if (strlen(path) >= sizeof(addr.sun_path)) {
		fprintf(stderr, "Socket path too long: %s\n", path);
		exit(1);
	}

	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (fd < 0) {
		fprintf(stderr, "Cannot open socket: %s\n",
			strerror(errno));
		exit(1);
	}

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	/* The length was checked above and addr is zeroed, so the copy is
	 * always NUL-terminated.
	 */
	strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);

	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (ret < 0) {
		fprintf(stderr, "Cannot bind to socket: %s\n",
			strerror(errno));
		close(fd);
		exit(1);
	}

	memset(fds, 0, sizeof(fds));
	memset(&aux, 0, sizeof(aux));

	ret = bpf_map_set_recv(fd, fds, &aux, BPF_SCM_MAX_FDS);
	if (ret >= 0)
		bpf_info_loop(fds, &aux);

	/* num_ent was supplied by the peer; never walk past the end of
	 * fds[] when closing.
	 */
	for (i = 0; i < aux.num_ent && i < BPF_SCM_MAX_FDS; i++)
		close(fds[i]);

	close(fd);
	return 0;
}

58
examples/bpf/bpf_funcs.h Normal file
View File

@ -0,0 +1,58 @@
#ifndef __BPF_FUNCS__
#define __BPF_FUNCS__
/* Misc macros. */
#ifndef __maybe_unused
# define __maybe_unused __attribute__ ((__unused__))
#endif
#ifndef __section
# define __section(NAME) __attribute__((section(NAME), used))
#endif
#ifndef offsetof
# define offsetof __builtin_offsetof
#endif
#ifndef htons
# define htons(x) __constant_htons((x))
#endif
#ifndef likely
# define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/* The verifier will translate them to actual function calls. */
/* Each pointer below is initialised with the numeric BPF_FUNC_* helper id
 * cast to a pointer. __maybe_unused keeps the compiler quiet when a
 * program including this header does not call a given helper.
 */
/* Map lookup helper: takes a map and a key, yields a pointer result. */
static void *(*bpf_map_lookup_elem)(void *map, void *key) __maybe_unused =
(void *) BPF_FUNC_map_lookup_elem;
/* Map update helper: takes a map, key, value and a flags word. */
static int (*bpf_map_update_elem)(void *map, void *key, void *value,
unsigned long long flags) __maybe_unused =
(void *) BPF_FUNC_map_update_elem;
/* Map delete helper: takes a map and a key. */
static int (*bpf_map_delete_elem)(void *map, void *key) __maybe_unused =
(void *) BPF_FUNC_map_delete_elem;
/* Helper bound to BPF_FUNC_get_smp_processor_id; no arguments. */
static unsigned int (*get_smp_processor_id)(void) __maybe_unused =
(void *) BPF_FUNC_get_smp_processor_id;
/* Helper bound to BPF_FUNC_get_prandom_u32; no arguments. */
static unsigned int (*get_prandom_u32)(void) __maybe_unused =
(void *) BPF_FUNC_get_prandom_u32;
/* LLVM built-in functions that an eBPF C program may use to emit
* BPF_LD_ABS and BPF_LD_IND instructions.
*/
/* Declarations only: the asm labels bind each C name to the LLVM
 * intrinsic of the same width (byte / half / word). Each takes the skb
 * and an offset and returns the loaded value as unsigned long long.
 */
unsigned long long load_byte(void *skb, unsigned long long off)
asm ("llvm.bpf.load.byte");
unsigned long long load_half(void *skb, unsigned long long off)
asm ("llvm.bpf.load.half");
unsigned long long load_word(void *skb, unsigned long long off)
asm ("llvm.bpf.load.word");
#endif /* __BPF_FUNCS__ */

View File

@ -1,66 +0,0 @@
#include "../../include/bpf_api.h"
/* This example demonstrates how classifier run-time behaviour
* can be altered with tail calls. We start out with an empty
* jmp_tc array, then add section aaa to the array slot 0, and
* later on atomically replace it with section bbb. Note that
* as shown in other examples, the tc loader can prepopulate
* tail called sections, here we start out with an empty one
* on purpose to show it can also be done this way.
*
* tc filter add dev foo parent ffff: bpf obj graft.o
* tc exec bpf dbg
* [...]
* Socket Thread-20229 [001] ..s. 138993.003923: : fallthrough
* <idle>-0 [001] ..s. 138993.202265: : fallthrough
* Socket Thread-20229 [001] ..s. 138994.004149: : fallthrough
* [...]
*
* tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec aaa
* tc exec bpf dbg
* [...]
* Socket Thread-19818 [002] ..s. 139012.053587: : aaa
* <idle>-0 [002] ..s. 139012.172359: : aaa
* Socket Thread-19818 [001] ..s. 139012.173556: : aaa
* [...]
*
* tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec bbb
* tc exec bpf dbg
* [...]
* Socket Thread-19818 [002] ..s. 139022.102967: : bbb
* <idle>-0 [002] ..s. 139022.155640: : bbb
* Socket Thread-19818 [001] ..s. 139022.156730: : bbb
* [...]
*/
/* Program array with a single slot, pinned by name. It is the target of
 * the tail_call() in cls_entry(); no section in this file is placed into
 * it at load time.
 */
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(key_size, sizeof(uint32_t));
__uint(value_size, sizeof(uint32_t));
__uint(max_entries, 1);
__uint(pinning, LIBBPF_PIN_BY_NAME);
} jmp_tc __section(".maps");
/* Section "aaa": trace the section name, then return TC_H_MAKE(1, 42). */
__section("aaa")
int cls_aaa(struct __sk_buff *skb)
{
	const int verdict = TC_H_MAKE(1, 42);

	printt("aaa\n");

	return verdict;
}
/* Section "bbb": trace the section name, then return TC_H_MAKE(1, 43). */
__section("bbb")
int cls_bbb(struct __sk_buff *skb)
{
	const int verdict = TC_H_MAKE(1, 43);

	printt("bbb\n");

	return verdict;
}
/* Classifier entry: try to tail-call slot 0 of jmp_tc; when execution
 * continues here instead, trace "fallthrough" and return BPF_H_DEFAULT.
 */
__section_cls_entry
int cls_entry(struct __sk_buff *skb)
{
	const uint32_t slot = 0;

	tail_call(skb, &jmp_tc, slot);

	/* Reached only when the tail call did not transfer control. */
	printt("fallthrough\n");

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

View File

@ -1,55 +0,0 @@
#include "../../include/bpf_api.h"
/* Inner map: a one-element array with 32-bit key and value. It is
 * referenced from slot 0 of map_outer's initialiser.
 */
struct inner_map {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(key_size, sizeof(uint32_t));
__uint(value_size, sizeof(uint32_t));
__uint(max_entries, 1);
} map_inner __section(".maps");
/* Outer array-of-maps with a single slot, pinned by name. Its values are
 * declared as struct inner_map, and slot 0 is statically initialised
 * with the address of map_inner.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(key_size, sizeof(uint32_t));
__uint(value_size, sizeof(uint32_t));
__uint(max_entries, 1);
__uint(pinning, LIBBPF_PIN_BY_NAME);
__array(values, struct inner_map);
} map_outer __section(".maps") = {
.values = {
[0] = &map_inner,
},
};
/* Section "egress": look up slot 0 of map_outer, then slot 0 of the map
 * found there, and add 1 to that value with lock_xadd(). Always returns
 * BPF_H_DEFAULT.
 */
__section("egress")
int emain(struct __sk_buff *skb)
{
	struct bpf_elf_map *inner;
	int idx = 0;
	int *counter;

	inner = map_lookup_elem(&map_outer, &idx);
	if (!inner)
		return BPF_H_DEFAULT;

	counter = map_lookup_elem(inner, &idx);
	if (counter)
		lock_xadd(counter, 1);

	return BPF_H_DEFAULT;
}
/* Section "ingress": look up slot 0 of map_outer, then slot 0 of the map
 * found there, and trace the stored value. Always returns BPF_H_DEFAULT.
 */
__section("ingress")
int imain(struct __sk_buff *skb)
{
	struct bpf_elf_map *inner;
	int idx = 0;
	int *counter;

	inner = map_lookup_elem(&map_outer, &idx);
	if (!inner)
		return BPF_H_DEFAULT;

	counter = map_lookup_elem(inner, &idx);
	if (counter)
		printt("map val: %d\n", *counter);

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

496
examples/bpf/bpf_prog.c Normal file
View File

@ -0,0 +1,496 @@
/*
* eBPF kernel space program part
*
* Toy eBPF program for demonstration purposes, some parts derived from
* kernel tree's samples/bpf/sockex2_kern.c example.
*
* More background on eBPF, kernel tree: Documentation/networking/filter.txt
*
* Note, this file is rather large, and most classifier and actions are
* likely smaller to accomplish one specific use-case and are tailored
* for high performance. For performance reasons, you might also have the
* classifier and action already merged inside the classifier.
*
* In order to show various features it serves as a bigger programming
* example, which you should feel free to rip apart and experiment with.
*
* Compilation, configuration example:
*
* Note: as long as the BPF backend in LLVM is still experimental,
* you need to build LLVM with --enable-experimental-targets=BPF
* Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
* and you have libelf.h and gelf.h headers and can link tc against -lelf.
*
* In case you need to sync kernel headers, go to your kernel source tree:
* # make headers_install INSTALL_HDR_PATH=/usr/
*
* $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
* $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
* $ objdump -h bpf.o
* [...]
* 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
* 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
* 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3
* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
* 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2
* CONTENTS, ALLOC, LOAD, DATA
* 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0
* CONTENTS, ALLOC, LOAD, DATA
* [...]
* # echo 1 > /proc/sys/net/core/bpf_jit_enable
* $ gcc bpf_agent.c -o bpf_agent -Wall -O2
* # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
* action bpf obj bpf.o sec action-mark \
* action bpf obj bpf.o sec action-rand ok
* # tc filter show dev em1
* filter parent 1: protocol all pref 49152 bpf
* filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
* action order 1: bpf bpf.o:[action-mark] default-action pipe
* index 52 ref 1 bind 1
*
* action order 2: bpf bpf.o:[action-rand] default-action pipe
* index 53 ref 1 bind 1
*
* action order 3: gact action pass
* random type none pass val 0
* index 38 ref 1 bind 1
*
* The same program can also be installed on ingress side (as opposed to above
* egress configuration), e.g.:
*
* # tc qdisc add dev em1 handle ffff: ingress
* # tc filter add dev em1 parent ffff: bpf obj ...
*
* Notes on BPF agent:
*
* In the above example, the bpf_agent creates the unix domain socket
* natively. "tc exec" can also spawn a shell and hold the sockets there:
*
* # tc exec bpf imp /tmp/bpf-uds
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
* action bpf obj bpf.o sec action-mark \
* action bpf obj bpf.o sec action-rand ok
* sh-4.2# (shell spawned from tc exec)
* sh-4.2# bpf_agent
* [...]
*
* This will read out fds over environment and produce the same data dump
* as below. This has the advantage that the spawned shell owns the fds
* and thus if the agent is restarted, it can reattach to the same fds, also
* various programs can easily read/modify the data simultaneously from user
* space side.
*
* If the shell is unnecessary, the agent can also just be spawned directly
* via tc exec:
*
* # tc exec bpf imp /tmp/bpf-uds run bpf_agent
* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
* action bpf obj bpf.o sec action-mark \
* action bpf obj bpf.o sec action-rand ok
*
* BPF agent example output:
*
* ver: 1
* obj: bpf.o
* dev: 64770
* ino: 6045133
* maps: 3
* map0:
* `- fd: 4
* | serial: 1
* | type: 1
* | max elem: 256
* | size key: 1
* ` size val: 16
* map1:
* `- fd: 5
* | serial: 2
* | type: 1
* | max elem: 1024
* | size key: 4
* ` size val: 16
* map2:
* `- fd: 6
* | serial: 3
* | type: 2
* | max elem: 64
* | size key: 4
* ` size val: 8
* data, period: 5sec
* `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
* ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
* data, period: 5sec
* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
* ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
* data, period: 5sec
* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
* ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
* [...]
*
* This now means, the below classifier and action pipeline has been loaded
* as eBPF bytecode into the kernel, the kernel has verified that the
* execution of the bytecode is "safe", and it has JITed the programs
* afterwards, so that upon invocation they're running on native speed. tc
* has transferred all map file descriptors to the bpf_agent via IPC and
* even after tc exits, the agent can read out or modify all map data.
*
* Note that the export to the uds is done only once in the classifier and
* not in the action. It's enough to export the (here) shared descriptors
* once.
*
* If you need to disassemble the generated JIT image (echo with 2), the
* kernel tree has under tools/net/ a small helper, you can invoke e.g.
* `bpf_jit_disasm -o`.
*
* Please find in the code below further comments.
*
* -- Happy eBPF hacking! ;)
*/
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <asm/types.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_tunnel.h>
#include <linux/filter.h>
#include <linux/bpf.h>
/* Common, shared definitions with bpf_agent.c. */
#include "bpf_shared.h"
/* Selection of BPF helper functions for our example. */
#include "bpf_funcs.h"
/* Could be defined here as well, or included from the header. */
#define TC_ACT_UNSPEC (-1)
#define TC_ACT_OK 0
#define TC_ACT_RECLASSIFY 1
#define TC_ACT_SHOT 2
#define TC_ACT_PIPE 3
#define TC_ACT_STOLEN 4
#define TC_ACT_QUEUED 5
#define TC_ACT_REPEAT 6
/* Other, misc stuff. */
#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF
/* eBPF map definitions, all placed in section "maps". */
/* Hash map keyed by a one-byte key (cls_update_proto_map() uses the IP
 * protocol number); values are struct count_tuple packet/byte counters.
 */
struct bpf_elf_map __section("maps") map_proto = {
.type = BPF_MAP_TYPE_HASH,
.id = BPF_MAP_ID_PROTO,
.size_key = sizeof(uint8_t),
.size_value = sizeof(struct count_tuple),
.max_elem = 256,
};
/* Hash map keyed by a 32-bit key (cls_update_queue_map() uses
 * skb->queue_mapping); values are struct count_queue counters.
 */
struct bpf_elf_map __section("maps") map_queue = {
.type = BPF_MAP_TYPE_HASH,
.id = BPF_MAP_ID_QUEUE,
.size_key = sizeof(uint32_t),
.size_value = sizeof(struct count_queue),
.max_elem = 1024,
};
/* Array map of 64 long-sized slots; act_update_drop_map() indexes it by
 * the current CPU id.
 */
struct bpf_elf_map __section("maps") map_drops = {
.type = BPF_MAP_TYPE_ARRAY,
.id = BPF_MAP_ID_DROPS,
.size_key = sizeof(uint32_t),
.size_value = sizeof(long),
.max_elem = 64,
};
/* Helper functions and definitions for the flow dissector used by the
* example classifier. This resembles the kernel's flow dissector to
* some extent and is just used as an example to show what's possible
* with eBPF.
*/
struct sockaddr;
/* Layout of one VLAN tag as used by flow_dissector(): only the offset of
 * h_vlan_encapsulated_proto and the total size are consumed there.
 */
struct vlan_hdr {
__be16 h_vlan_TCI;
__be16 h_vlan_encapsulated_proto;
};
/* Result of flow_dissector(). */
struct flow_keys {
/* IPv4 source address, or the XOR of the four IPv6 address words. */
__u32 src;
/* IPv4 destination address, or the XOR of the four IPv6 address words. */
__u32 dst;
/* 32-bit word loaded at th_off; also addressable as two 16-bit halves. */
union {
__u32 ports;
__u16 port16[2];
};
/* Offset the ports word was loaded from. */
__s32 th_off;
/* Protocol number determined by the dissector. */
__u8 ip_proto;
};
/* Return the offset added to the header offset before the ports word is
 * loaded: 4 for IPPROTO_AH, 0 for every other protocol value.
 */
static inline int flow_ports_offset(__u8 ip_proto)
{
	if (ip_proto == IPPROTO_AH)
		return 4;

	return 0;
}
/* True when the IPv4 header at nh_off has the MF bit set or a non-zero
 * fragment offset.
 */
static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
{
	unsigned long long frag_field;

	frag_field = load_half(skb, nh_off + offsetof(struct iphdr, frag_off));

	return (frag_field & (IP_MF | IP_OFFSET)) != 0;
}
/* Parse the IPv4 header located at nh_off.
 *
 * Stores the protocol in *ip_proto (0 for any fragment), fills
 * flow->src / flow->dst unless the protocol is GRE, and returns the
 * offset just past the IPv4 header.
 */
static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	__u8 ver_ihl;

	*ip_proto = unlikely(flow_is_frag(skb, nh_off)) ? 0 :
		    load_byte(skb, nh_off + offsetof(struct iphdr, protocol));

	if (*ip_proto != IPPROTO_GRE) {
		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

	/* First header byte holds version and header length (ihl). */
	ver_ihl = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);

	/* Fast path: version 4 with a 20-byte header and no options. */
	if (likely(ver_ihl == 0x45))
		return nh_off + 20;

	return nh_off + ((ver_ihl & 0xF) << 2);
}
/* Fold the 16-byte IPv6 address starting at off into 32 bits by XOR-ing
 * its four consecutive 32-bit words.
 */
static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 hash;

	hash  = load_word(skb, off);
	hash ^= load_word(skb, off + sizeof(hash));
	hash ^= load_word(skb, off + sizeof(hash) * 2);
	hash ^= load_word(skb, off + sizeof(hash) * 3);

	return hash;
}
/* Parse the IPv6 header located at nh_off.
 *
 * Stores the nexthdr field in *ip_proto, stores the XOR-folded source
 * and destination addresses in flow, and returns the offset just past
 * the fixed IPv6 header.
 */
static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	int next_off;

	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb,
					nh_off + offsetof(struct ipv6hdr, saddr));
	flow->dst = flow_addr_hash_ipv6(skb,
					nh_off + offsetof(struct ipv6hdr, daddr));

	next_off = nh_off + sizeof(struct ipv6hdr);

	return next_off;
}
/* Dissect the packet behind skb into *flow.
 *
 * Walks up to two VLAN tags (802.1ad, then 802.1Q), parses the IPv4 or
 * IPv6 header, and unwraps one level of GRE, IPIP or IPv6-in-IP
 * encapsulation. On success flow->ports, flow->th_off and
 * flow->ip_proto are filled in, in addition to the addresses stored by
 * the IPv4/IPv6 parsers.
 *
 * Returns true when the packet was dissected, false when the (inner)
 * protocol is neither IPv4 nor IPv6.
 *
 * Byte order: 'proto' is kept in network byte order so it can be
 * compared with htons(ETH_P_*), and the GRE_* flag constants are
 * big endian as well. load_half() returns host byte order, so its
 * result is converted with htons() before it is used in either role.
 */
static inline bool flow_dissector(struct __sk_buff *skb,
				  struct flow_keys *flow)
{
	int nh_off = BPF_LL_OFF + ETH_HLEN;
	__be16 proto = skb->protocol;
	__u8 ip_proto;
	__u16 encap;

	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
	if (proto == htons(ETH_P_8021AD)) {
		/* Load into a local first so htons() sees a plain value. */
		encap = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		proto = htons(encap);
		nh_off += sizeof(struct vlan_hdr);
	}

	if (proto == htons(ETH_P_8021Q)) {
		encap = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		proto = htons(encap);
		nh_off += sizeof(struct vlan_hdr);
	}

	if (likely(proto == htons(ETH_P_IP)))
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
	else if (proto == htons(ETH_P_IPV6))
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
	else
		return false;

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		};
		__u16 raw_flags = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, flags));
		/* gre_proto stays in host order: it is compared with
		 * plain ETH_P_* values below.
		 */
		__u16 gre_proto = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, proto));
		/* Back to network order to match the GRE_* constants. */
		__be16 gre_flags = htons(raw_flags);

		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
			break;

		/* Base header, then one optional 4-byte field per flag. */
		nh_off += 4;
		if (gre_flags & GRE_CSUM)
			nh_off += 4;
		if (gre_flags & GRE_KEY)
			nh_off += 4;
		if (gre_flags & GRE_SEQ)
			nh_off += 4;

		if (gre_proto == ETH_P_8021Q) {
			gre_proto = load_half(skb, nh_off +
					      offsetof(struct vlan_hdr,
						       h_vlan_encapsulated_proto));
			nh_off += sizeof(struct vlan_hdr);
		}
		if (gre_proto == ETH_P_IP)
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		else if (gre_proto == ETH_P_IPV6)
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		else
			return false;
		break;
	}
	case IPPROTO_IPIP:
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		break;
	case IPPROTO_IPV6:
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		break;
	default:
		break;
	}

	nh_off += flow_ports_offset(ip_proto);

	flow->ports = load_word(skb, nh_off);
	flow->th_off = nh_off;
	flow->ip_proto = ip_proto;

	return true;
}
static inline void cls_update_proto_map(const struct __sk_buff *skb,
const struct flow_keys *flow)
{
uint8_t proto = flow->ip_proto;
struct count_tuple *ct, _ct;
ct = bpf_map_lookup_elem(&map_proto, &proto);
if (likely(ct)) {
__sync_fetch_and_add(&ct->packets, 1);
__sync_fetch_and_add(&ct->bytes, skb->len);
return;
}
/* No hit yet, we need to create a new entry. */
_ct.packets = 1;
_ct.bytes = skb->len;
bpf_map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
}
static inline void cls_update_queue_map(const struct __sk_buff *skb)
{
uint32_t queue = skb->queue_mapping;
struct count_queue *cq, _cq;
bool mismatch;
mismatch = skb->queue_mapping != get_smp_processor_id();
cq = bpf_map_lookup_elem(&map_queue, &queue);
if (likely(cq)) {
__sync_fetch_and_add(&cq->total, 1);
if (mismatch)
__sync_fetch_and_add(&cq->mismatch, 1);
return;
}
/* No hit yet, we need to create a new entry. */
_cq.total = 1;
_cq.mismatch = mismatch ? 1 : 0;
bpf_map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
}
/* eBPF program definitions, placed in various sections, which can
* have custom section names. If custom names are in use, it's
* required to point tc to the correct section, e.g.
*
* tc filter add [...] bpf obj cls.o sec cls-tos [...]
*
* in case the program resides in __section("cls-tos").
*
* Default section for cls_bpf is: "classifier", for act_bpf is:
* "action". Naturally, if for example multiple actions are present
* in the same file, they need to have distinct section names.
*
* It is however not required to have multiple programs sharing
* a file.
*/
/* Classifier program: dissect the packet, update the per-protocol and
 * per-queue statistics, and return the dissected IP protocol number.
 * Returns 0 when the packet could not be dissected.
 */
__section("classifier") int cls_main(struct __sk_buff *skb)
{
	struct flow_keys flow;

	if (flow_dissector(skb, &flow)) {
		cls_update_proto_map(skb, &flow);
		cls_update_queue_map(skb);

		return flow.ip_proto;
	}

	return 0; /* No match in cls_bpf. */
}
/* Increment the drop counter stored in map_drops for the current CPU.
 *
 * map_drops is declared with size_value = sizeof(long), so the slot is
 * accessed through a long pointer. Going through a 32-bit pointer would
 * only touch half of the value wherever long is wider than 32 bits.
 * Nothing happens when the lookup for this CPU id yields no element.
 */
static inline void act_update_drop_map(void)
{
	uint32_t cpu = get_smp_processor_id();
	long *count;

	count = bpf_map_lookup_elem(&map_drops, &cpu);
	if (count)
		/* Only this cpu is accessing this element. */
		(*count)++;
}
/* Action program "action-mark": packets whose mark equals 0xcafe are
 * counted in the drop map and answered with TC_ACT_SHOT; everything else
 * gets TC_ACT_UNSPEC.
 */
__section("action-mark") int act_mark_main(struct __sk_buff *skb)
{
	/* You could also mangle skb data here with the helper function
	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
	 * do that already in the classifier itself as a merged combination
	 * of classifier'n'action model.
	 */
	if (skb->mark != 0xcafe)
		/* Default configured tc opcode. */
		return TC_ACT_UNSPEC;

	act_update_drop_map();

	return TC_ACT_SHOT;
}
/* Action program "action-rand": when the two low bits of a pseudo-random
 * number are both zero, the packet is counted in the drop map and
 * answered with TC_ACT_SHOT; otherwise TC_ACT_UNSPEC is returned.
 */
__section("action-rand") int act_rand_main(struct __sk_buff *skb)
{
	const unsigned int low_bits = get_prandom_u32() & 3;

	if (low_bits != 0)
		return TC_ACT_UNSPEC;

	/* Sorry, we're near event horizon ... */
	act_update_drop_map();

	return TC_ACT_SHOT;
}
/* Last but not least, the file contains a license. Some future helper
* functions may only be available with a GPL license.
*/
char __license[] __section("license") = "GPL";

View File

@ -1,53 +0,0 @@
#include "../../include/bpf_api.h"
/* Minimal, stand-alone toy map pinning example:
*
* clang -target bpf -O2 [...] -o bpf_shared.o -c bpf_shared.c
* tc filter add dev foo parent 1: bpf obj bpf_shared.o sec egress
* tc filter add dev foo parent ffff: bpf obj bpf_shared.o sec ingress
*
* Both classifier will share the very same map instance in this example,
* so map content can be accessed from ingress *and* egress side!
*
* This example has a pinning of PIN_OBJECT_NS, so it's private and
* thus shared among various program sections within the object.
*
* A setting of PIN_GLOBAL_NS would place it into a global namespace,
* so that it can be shared among different object files. A setting
* of PIN_NONE (= 0) means no sharing, so each tc invocation a new map
* instance is being created.
*/
/* One-element array with 32-bit key and value, shared by the "egress"
 * and "ingress" sections below.
 * NOTE(review): the file comment above talks about PIN_OBJECT_NS /
 * PIN_GLOBAL_NS / PIN_NONE, while this definition uses
 * LIBBPF_PIN_BY_NAME - confirm which wording is current.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(key_size, sizeof(uint32_t));
__uint(value_size, sizeof(uint32_t));
__uint(max_entries, 1);
__uint(pinning, LIBBPF_PIN_BY_NAME); /* or LIBBPF_PIN_NONE */
} map_sh __section(".maps");
/* Section "egress": add 1 to slot 0 of map_sh with lock_xadd() when the
 * lookup succeeds. Always returns BPF_H_DEFAULT.
 */
__section("egress")
int emain(struct __sk_buff *skb)
{
	int idx = 0;
	int *counter = map_lookup_elem(&map_sh, &idx);

	if (!counter)
		return BPF_H_DEFAULT;

	lock_xadd(counter, 1);

	return BPF_H_DEFAULT;
}
/* Section "ingress": trace the value stored in slot 0 of map_sh when the
 * lookup succeeds. Always returns BPF_H_DEFAULT.
 */
__section("ingress")
int imain(struct __sk_buff *skb)
{
	int idx = 0;
	int *counter = map_lookup_elem(&map_sh, &idx);

	if (!counter)
		return BPF_H_DEFAULT;

	printt("map val: %d\n", *counter);

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

26
examples/bpf/bpf_shared.h Normal file
View File

@ -0,0 +1,26 @@
#ifndef __BPF_SHARED__
#define __BPF_SHARED__
#include <stdint.h>
#include "../../include/bpf_elf.h"
/* Identifiers for the maps of the example; bpf_prog.c stores them in the
 * .id field of map_proto, map_queue and map_drops respectively.
 */
enum {
BPF_MAP_ID_PROTO,
BPF_MAP_ID_QUEUE,
BPF_MAP_ID_DROPS,
/* Number of ids above; keep last. */
__BPF_MAP_ID_MAX,
#define BPF_MAP_ID_MAX __BPF_MAP_ID_MAX
};
/* Packet and byte counters; used as the value type of map_proto. */
struct count_tuple {
long packets; /* type long for __sync_fetch_and_add() */
long bytes;
};
/* Per-queue counters; used as the value type of map_queue. */
struct count_queue {
/* Packets seen on the queue. */
long total;
/* Packets whose queue number differed from the processing CPU id. */
long mismatch;
};
#endif /* __BPF_SHARED__ */

23
examples/bpf/bpf_sys.h Normal file
View File

@ -0,0 +1,23 @@
#ifndef __BPF_SYS__
#define __BPF_SYS__
#include <sys/syscall.h>
#include <linux/bpf.h>
/* Convert a pointer to the 64-bit integer form stored in union bpf_attr
 * fields; the value goes through unsigned long before being widened.
 */
static inline __u64 bpf_ptr_to_u64(const void *ptr)
{
	unsigned long addr = (unsigned long) ptr;

	return (__u64) addr;
}
static inline int bpf_lookup_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = bpf_ptr_to_u64(key),
.value = bpf_ptr_to_u64(value),
};
return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
#endif /* __BPF_SYS__ */

View File

@ -1,35 +0,0 @@
#include "../../../include/bpf_api.h"
/* Cyclic dependency example to test the kernel's runtime upper
* bound on loops. Also demonstrates on how to use direct-actions,
* loaded as: tc filter add [...] bpf da obj [...]
*/
#define JMP_MAP_ID 0xabccba
/* Single-slot program array. Its id, JMP_MAP_ID, is the same id used in
 * the __section_tail(JMP_MAP_ID, 0) annotation on cls_loop().
 */
struct bpf_elf_map __section_maps jmp_tc = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.id = JMP_MAP_ID,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_OBJECT_NS,
.max_elem = 1,
};
/* Tail-called program in slot 0 of jmp_tc: trace and post-increment
 * skb->cb[0], then tail-call slot 0 again. When execution continues
 * past the tail call, set the class id and return TC_ACT_OK.
 */
__section_tail(JMP_MAP_ID, 0)
int cls_loop(struct __sk_buff *skb)
{
	uint32_t round = skb->cb[0]++;

	printt("cb: %u\n", round);

	tail_call(skb, &jmp_tc, 0);

	/* Reached only when the tail call did not transfer control. */
	skb->tc_classid = TC_H_MAKE(1, 42);

	return TC_ACT_OK;
}
/* Classifier entry: tail-call slot 0 of jmp_tc; return TC_ACT_SHOT when
 * execution continues here instead.
 */
__section_cls_entry
int cls_entry(struct __sk_buff *skb)
{
	const uint32_t slot = 0;

	tail_call(skb, &jmp_tc, slot);

	return TC_ACT_SHOT;
}
BPF_LICENSE("GPL");

View File

@ -1,66 +0,0 @@
#include "../../../include/bpf_api.h"
/* This example demonstrates how classifier run-time behaviour
* can be altered with tail calls. We start out with an empty
* jmp_tc array, then add section aaa to the array slot 0, and
* later on atomically replace it with section bbb. Note that
* as shown in other examples, the tc loader can prepopulate
* tail called sections, here we start out with an empty one
* on purpose to show it can also be done this way.
*
* tc filter add dev foo parent ffff: bpf obj graft.o
* tc exec bpf dbg
* [...]
* Socket Thread-20229 [001] ..s. 138993.003923: : fallthrough
* <idle>-0 [001] ..s. 138993.202265: : fallthrough
* Socket Thread-20229 [001] ..s. 138994.004149: : fallthrough
* [...]
*
* tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec aaa
* tc exec bpf dbg
* [...]
* Socket Thread-19818 [002] ..s. 139012.053587: : aaa
* <idle>-0 [002] ..s. 139012.172359: : aaa
* Socket Thread-19818 [001] ..s. 139012.173556: : aaa
* [...]
*
* tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec bbb
* tc exec bpf dbg
* [...]
* Socket Thread-19818 [002] ..s. 139022.102967: : bbb
* <idle>-0 [002] ..s. 139022.155640: : bbb
* Socket Thread-19818 [001] ..s. 139022.156730: : bbb
* [...]
*/
/* Single-slot program array pinned with PIN_GLOBAL_NS. It is the target
 * of the tail_call() in cls_entry(); no section in this file is tagged
 * for it.
 */
struct bpf_elf_map __section_maps jmp_tc = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_GLOBAL_NS,
.max_elem = 1,
};
/* Section "aaa": trace the section name, then return TC_H_MAKE(1, 42). */
__section("aaa")
int cls_aaa(struct __sk_buff *skb)
{
	const int verdict = TC_H_MAKE(1, 42);

	printt("aaa\n");

	return verdict;
}
/* Section "bbb": trace the section name, then return TC_H_MAKE(1, 43). */
__section("bbb")
int cls_bbb(struct __sk_buff *skb)
{
	const int verdict = TC_H_MAKE(1, 43);

	printt("bbb\n");

	return verdict;
}
/* Classifier entry: try to tail-call slot 0 of jmp_tc; when execution
 * continues here instead, trace "fallthrough" and return BPF_H_DEFAULT.
 */
__section_cls_entry
int cls_entry(struct __sk_buff *skb)
{
	const uint32_t slot = 0;

	tail_call(skb, &jmp_tc, slot);

	/* Reached only when the tail call did not transfer control. */
	printt("fallthrough\n");

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

View File

@ -1,56 +0,0 @@
#include "../../../include/bpf_api.h"
#define MAP_INNER_ID 42
/* Inner one-element array. Its id, MAP_INNER_ID, is the value map_outer
 * names in .inner_id; .inner_idx is set to 0.
 */
struct bpf_elf_map __section_maps map_inner = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.id = MAP_INNER_ID,
.inner_idx = 0,
.pinning = PIN_GLOBAL_NS,
.max_elem = 1,
};
/* Outer array-of-maps with one slot; .inner_id refers to the map whose
 * id is MAP_INNER_ID (map_inner in this file).
 */
struct bpf_elf_map __section_maps map_outer = {
.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.inner_id = MAP_INNER_ID,
.pinning = PIN_GLOBAL_NS,
.max_elem = 1,
};
/* Section "egress": look up slot 0 of map_outer, then slot 0 of the map
 * found there, and add 1 to that value with lock_xadd(). Always returns
 * BPF_H_DEFAULT.
 */
__section("egress")
int emain(struct __sk_buff *skb)
{
	struct bpf_elf_map *inner;
	int idx = 0;
	int *counter;

	inner = map_lookup_elem(&map_outer, &idx);
	if (!inner)
		return BPF_H_DEFAULT;

	counter = map_lookup_elem(inner, &idx);
	if (counter)
		lock_xadd(counter, 1);

	return BPF_H_DEFAULT;
}
/* Section "ingress": look up slot 0 of map_outer, then slot 0 of the map
 * found there, and trace the stored value. Always returns BPF_H_DEFAULT.
 */
__section("ingress")
int imain(struct __sk_buff *skb)
{
	struct bpf_elf_map *inner;
	int idx = 0;
	int *counter;

	inner = map_lookup_elem(&map_outer, &idx);
	if (!inner)
		return BPF_H_DEFAULT;

	counter = map_lookup_elem(inner, &idx);
	if (counter)
		printt("map val: %d\n", *counter);

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

View File

@ -1,53 +0,0 @@
#include "../../../include/bpf_api.h"
/* Minimal, stand-alone toy map pinning example:
*
* clang -target bpf -O2 [...] -o bpf_shared.o -c bpf_shared.c
* tc filter add dev foo parent 1: bpf obj bpf_shared.o sec egress
* tc filter add dev foo parent ffff: bpf obj bpf_shared.o sec ingress
*
* Both classifier will share the very same map instance in this example,
* so map content can be accessed from ingress *and* egress side!
*
* This example has a pinning of PIN_OBJECT_NS, so it's private and
* thus shared among various program sections within the object.
*
* A setting of PIN_GLOBAL_NS would place it into a global namespace,
* so that it can be shared among different object files. A setting
* of PIN_NONE (= 0) means no sharing, so each tc invocation a new map
* instance is being created.
*/
/* One-element array with 32-bit key and value, used by both the "egress"
 * and "ingress" sections below.
 */
struct bpf_elf_map __section_maps map_sh = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_OBJECT_NS, /* or PIN_GLOBAL_NS, or PIN_NONE */
.max_elem = 1,
};
/* Section "egress": add 1 to slot 0 of map_sh with lock_xadd() when the
 * lookup succeeds. Always returns BPF_H_DEFAULT.
 */
__section("egress")
int emain(struct __sk_buff *skb)
{
	int idx = 0;
	int *counter = map_lookup_elem(&map_sh, &idx);

	if (!counter)
		return BPF_H_DEFAULT;

	lock_xadd(counter, 1);

	return BPF_H_DEFAULT;
}
/* Section "ingress": trace the value stored in slot 0 of map_sh when the
 * lookup succeeds. Always returns BPF_H_DEFAULT.
 */
__section("ingress")
int imain(struct __sk_buff *skb)
{
	int idx = 0;
	int *counter = map_lookup_elem(&map_sh, &idx);

	if (!counter)
		return BPF_H_DEFAULT;

	printt("map val: %d\n", *counter);

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

View File

@ -1,117 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include "../../../include/bpf_api.h"
#define ENTRY_INIT 3
#define ENTRY_0 0
#define ENTRY_1 1
#define MAX_JMP_SIZE 2
#define FOO 42
#define BAR 43
/* This example doesn't really do anything useful, but its purpose is to
* demonstrate eBPF tail calls on a very simple example.
*
* cls_entry() is our classifier entry point, from there we jump based on
* skb->hash into cls_case1() or cls_case2(). They are both part of the
* program array jmp_tc. Indicated via __section_tail(), the tc loader
* populates the program arrays with the loaded file descriptors already.
*
* To demonstrate nested jumps, cls_case2() jumps within the same jmp_tc
* array to cls_case1(). And whenever we arrive at cls_case1(), we jump
* into cls_exit(), part of the jump array jmp_ex.
*
* Also, to show it's possible, all programs share map_sh and dump the value
* that the entry point incremented. The sections that are loaded into a
* program array can be atomically replaced during run-time, e.g. to change
* classifier behaviour.
*/
/* Program array with MAX_JMP_SIZE slots. Its id, FOO, is the id used by
 * the __section_tail(FOO, ...) annotations on cls_case1() and
 * cls_case2().
 */
struct bpf_elf_map __section_maps jmp_tc = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.id = FOO,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_OBJECT_NS,
.max_elem = MAX_JMP_SIZE,
};
/* Single-slot program array. Its id, BAR, is the id used by the
 * __section_tail(BAR, ENTRY_0) annotation on cls_exit().
 */
struct bpf_elf_map __section_maps jmp_ex = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.id = BAR,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_OBJECT_NS,
.max_elem = 1,
};
/* One-element array read by every program in this file; cls_entry()
 * increments its slot 0.
 */
struct bpf_elf_map __section_maps map_sh = {
.type = BPF_MAP_TYPE_ARRAY,
.size_key = sizeof(uint32_t),
.size_value = sizeof(uint32_t),
.pinning = PIN_OBJECT_NS,
.max_elem = 1,
};
/* Slot ENTRY_0 of jmp_tc: trace the shared map value together with the
 * previous skb->cb[0], record ENTRY_0 in skb->cb[0], then tail-call slot
 * ENTRY_0 of jmp_ex. Returns BPF_H_DEFAULT when execution continues.
 */
__section_tail(FOO, ENTRY_0)
int cls_case1(struct __sk_buff *skb)
{
	int idx = 0;
	int *shared = map_lookup_elem(&map_sh, &idx);

	if (shared)
		printt("case1: map-val: %d from:%u\n", *shared, skb->cb[0]);

	skb->cb[0] = ENTRY_0;
	tail_call(skb, &jmp_ex, ENTRY_0);

	return BPF_H_DEFAULT;
}
/* Slot ENTRY_1 of jmp_tc: trace the shared map value together with the
 * previous skb->cb[0], record ENTRY_1 in skb->cb[0], then tail-call slot
 * ENTRY_0 of jmp_tc. Returns BPF_H_DEFAULT when execution continues.
 */
__section_tail(FOO, ENTRY_1)
int cls_case2(struct __sk_buff *skb)
{
	int idx = 0;
	int *shared = map_lookup_elem(&map_sh, &idx);

	if (shared)
		printt("case2: map-val: %d from:%u\n", *shared, skb->cb[0]);

	skb->cb[0] = ENTRY_1;
	tail_call(skb, &jmp_tc, ENTRY_0);

	return BPF_H_DEFAULT;
}
/* Slot ENTRY_0 of jmp_ex: trace the shared map value together with
 * skb->cb[0] and return BPF_H_DEFAULT without any further tail call.
 */
__section_tail(BAR, ENTRY_0)
int cls_exit(struct __sk_buff *skb)
{
	int idx = 0;
	int *shared = map_lookup_elem(&map_sh, &idx);

	if (shared)
		printt("exit: map-val: %d from:%u\n", *shared, skb->cb[0]);

	/* Termination point. */
	return BPF_H_DEFAULT;
}
/* Classifier entry: when slot 0 of map_sh exists, increment it, store
 * ENTRY_INIT in skb->cb[0] and tail-call the jmp_tc slot selected by the
 * low bits of skb->hash. When execution continues here, trace
 * "fallthrough" and return BPF_H_DEFAULT.
 */
__section_cls_entry
int cls_entry(struct __sk_buff *skb)
{
	int idx = 0;
	int *shared;

	/* For transferring state, we can use skb->cb[0] ... skb->cb[4]. */
	shared = map_lookup_elem(&map_sh, &idx);
	if (shared) {
		uint32_t slot;

		lock_xadd(shared, 1);
		skb->cb[0] = ENTRY_INIT;

		slot = skb->hash & (MAX_JMP_SIZE - 1);
		tail_call(skb, &jmp_tc, slot);
	}

	printt("fallthrough\n");

	return BPF_H_DEFAULT;
}
BPF_LICENSE("GPL");

983
examples/cbq.init-v0.7.3 Normal file
View File

@ -0,0 +1,983 @@
#!/bin/bash
#
# cbq.init v0.7.3
# Copyright (C) 1999 Pavel Golubev <pg@ksi-linux.com>
# Copyright (C) 2001-2004 Lubomir Bulej <pallas@kadan.cz>
#
# chkconfig: 2345 11 89
# description: sets up CBQ-based traffic control
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
# To get the latest version, check on Freshmeat for actual location:
#
# http://freshmeat.net/projects/cbq.init
#
#
# VERSION HISTORY
# ---------------
# v0.7.3- Deepak Singhal <singhal at users.sourceforge.net>
# - fix timecheck to not ignore regular TIME rules after
# encountering a TIME rule that spans over midnight
# - Nathan Shafer <nicodemus at users.sourceforge.net>
# - allow symlinks to class files
# - Seth J. Blank <antifreeze at users.sourceforge.net>
# - replace hardcoded ip/tc location with variables
# - Mark Davis <mark.davis at gmx.de>
# - allow setting of PRIO_{MARK,RULE,REALM} in class file
# - Fernando Sanch <toptnc at users.sourceforge.net>
# - allow underscores in interface names
# v0.7.2- Paulo Sedrez
# - fix time2abs to allow hours with leading zero in TIME rules
# - Svetlin Simeonov <zvero at yahoo.com>
# - fix cbq_device_list to allow VLAN interfaces
# - Mark Davis <mark.davis at gmx.de>
# - ignore *~ backup files when looking for classes
# - Mike Boyer <boyer at administrative.com>
# - fix to allow arguments to be passed to "restart" command
# v0.7.1- Lubomir Bulej <pallas at kadan.cz>
# - default value for PERTURB
# - fixed small bug in RULE parser to correctly parse rules with
# identical source and destination fields
# - faster initial scanning of DEVICE fields
# v0.7 - Lubomir Bulej <pallas at kadan.cz>
# - lots of various cleanups and reorganizations; the parsing is now
# some 40% faster, but the class ID must be in range 0x0002-0xffff
# (again). Because of the number of internal changes and the above
# class ID restriction, I bumped the version to 0.7 to indicate
# something might have got broken :)
# - changed PRIO_{U32,FW,ROUTE} to PRIO_{RULE,MARK,REALM}
# for consistency with filter keywords
# - exposed "compile" command
# - Catalin Petrescu <taz at dntis.ro>
# - support for port masks in RULE (u32) filter
# - Jordan Vrtanoski <obeliks at mt.net.mk>
# - support for week days in TIME rules
# v0.6.4- Lubomir Bulej <pallas at kadan.cz>
# - added PRIO_* variables to allow easy control of filter priorities
# - added caching to speed up CBQ start, the cache is invalidated
# whenever any of the configuration files changes
# - updated the readme section + some cosmetic fixes
# v0.6.3- Lubomir Bulej <pallas at kadan.cz>
# - removed setup of (unnecessary) class 1:1 - all classes
# now use qdisc's default class 1:0 as their parent
# - minor fix in the timecheck branch - classes
# without leaf qdisc were not updated
# - minor fix to avoid timecheck failure when run
# at time with minutes equal to 08 or 09
# - respect CBQ_PATH setting in environment
# - made PRIO=5 default, rendering it optional in configs
# - added support for route filter, see notes about REALM keyword
# - added support for fw filter, see notes about MARK keyword
# - added filter display to "list" and "stats" commands
# - readme section update + various cosmetic fixes
# v0.6.2- Catalin Petrescu <taz at dntis.ro>
# - added tunnels interface handling
# v0.6.1- Pavel Golubev <pg at ksi-linux.com>
# - added sch_prio module loading
# (thanks johan at iglo.virtual.or.id for reminding)
# - resolved errors resulting from stricter syntax checking in bash2
# - Lubomir Bulej <pallas at kadan.cz>
# - various cosmetic fixes
# v0.6 - Lubomir Bulej <pallas at kadan.cz>
# - attempt to limit number of spawned processes by utilizing
# more of sed power (use sed instead of grep+cut)
# - simplified TIME parser, using bash builtins
# - added initial support for SFQ as leaf qdisc
# - reworked the documentation part a little
# - incorporated pending patches and ideas submitted by
# following people for versions 0.3 into version 0.6
# - Miguel Freitas <miguel at cetuc.puc-rio.br>
# - in case of overlapping TIME parameters, the last match is taken
# - Juanjo Ciarlante <jjo at mendoza.gov.ar>
# - chkconfig tags, list + stats startup parameters
# - optional tc & ip command logging (into /var/run/cbq-*)
# - Rafal Maszkowski <rzm at icm.edu.pl>
# - PEAK parameter for setting TBF's burst peak rate
# - fix for many config files (use find instead of ls)
# v0.5.1- Lubomir Bulej <pallas at kadan.cz>
# - fixed little but serious bug in RULE parser
# v0.5 - Lubomir Bulej <pallas at kadan.cz>
# - added options PARENT, LEAF, ISOLATED and BOUNDED. This allows
# (with some attention to config file ordering) for creating
# hierarchical structures of shapers with classes able (or unable)
# to borrow bandwidth from their parents.
# - class ID check allows hexadecimal numbers
# - rewritten & simplified RULE parser
# - cosmetic changes to improve readability
# - reorganization to avoid duplicate code (timecheck etc.)
# - timecheck doesn't check classes without TIME fields anymore
# v0.4 - Lubomir Bulej <pallas at kadan.cz>
# - small bugfix in RULE parsing code
# - simplified configuration parsing code
# - several small cosmetic changes
# - TIME parameter can be now specified more than once allowing you to
# differentiate RATE throughout the whole day. Time overlapping is
# not checked, first match is taken. Midnight wrap (eg. 20:00-6:00)
# is allowed and taken care of.
# v0.3a4- fixed small bug in IF operator. Thanks to
# Rafal Maszkowski <rzm at icm.edu.pl>
# v0.3a3- fixed grep bug when using more than 10 eth devices. Thanks to David
# Trcka <trcka at poda.cz>.
# v0.3a2- fixed bug in "if" operator. Thanks kad at dgtu.donetsk.ua.
# v0.3a - added TIME parameter. Example: TIME=00:00-19:00;64Kbit/6Kbit
# So, between 00:00 and 19:00 the RATE will be 64Kbit.
# Just start "cbq.init timecheck" periodically from cron
# (every 10 minutes for example). DON'T FORGET though, to run
# "cbq.init start" for CBQ to initialize.
# v0.2 - Some cosmetic changes. Now it is more compatible with old bash
# version. Thanks to Stanislav V. Voronyi <stas at cnti.uanet.kharkov.ua>.
# v0.1 - First public release
#
#
# README
# ------
#
# First of all - this is just a SIMPLE EXAMPLE of CBQ power.
# Don't ask me "why" and "how" :)
#
# This script is meant to simplify setup and management of relatively simple
# CBQ-based traffic control on Linux. Access to advanced networking features
# of Linux kernel is provided by "ip" and "tc" utilities from A. Kuznetsov's
# iproute2 package, available at ftp://ftp.inr.ac.ru/ip-routing. Because the
# utilities serve primarily to translate user wishes to RTNETLINK commands,
# their interface is rather spartan, intolerant and requires quite a lot of
# typing. And typing is what this script attempts to reduce :)
#
# The advanced networking stuff in Linux is pretty flexible and this script
# aims to bring some of its features to the not-so-hard-core Linux users. Of
# course, there is a tradeoff between simplicity and flexibility and you may
# realize that the flexibility suffered too much for your needs -- time to
# face "ip" and "tc" interface.
#
# To speed up the "start" command, simple caching was introduced in version
# 0.6.4. The caching works so that the sequence of "tc" commands for given
# configuration is stored in a file (/var/cache/cbq.init by default) which
# is used next time the "start" command is run to avoid repeated parsing of
# configuration files. This cache is invalidated whenever any of the CBQ
# configuration files changes. If you want to run "cbq.init start" without
# caching, run it as "cbq.init start nocache". If you want to force cache
# invalidation, run it as "cbq.init start invalidate". Caching is disabled
# if you have logging enabled (ie. CBQ_DEBUG is not empty).
#
# If you only want cbq.init to translate your configuration to "tc" commands,
# use "compile" command which will output "tc" commands required to build
# your configuration. Bear in mind that "compile" does not check if the "tc"
# commands were successful - this is done (in certain places) only when the
# "start nocache" command is used, which is also useful when creating the
# configuration to check whether it is completely valid.
#
# All CBQ parameters are valid for Ethernet interfaces only. The script was
# tested on various Linux kernel versions from series 2.1 to 2.4 and several
# distributions with KSI Linux (Nostromo version) as the premier one.
#
#
# HOW DOES IT WORK?
# -----------------
#
# Every traffic class must be described by a file in the $CBQ_PATH directory
# (/etc/sysconfig/cbq by default) - one file per class.
#
# The config file names must obey mandatory format: cbq-<clsid>.<name> where
# <clsid> is two-byte hexadecimal number in range <0002-FFFF> (which in fact
# is a CBQ class ID) and <name> is the name of the class -- anything to help
# you distinguish the configuration files. For small amount of classes it is
# often possible (and convenient) to let <clsid> resemble bandwidth of the
# class.
#
# Example of valid config name:
# cbq-1280.My_first_shaper
#
#
# The configuration file may contain the following parameters:
#
### Device parameters
#
# DEVICE=<ifname>,<bandwidth>[,<weight>] mandatory
# DEVICE=eth0,10Mbit,1Mbit
#
# <ifname> is the name of the interface you want to control
# traffic on, e.g. eth0
# <bandwidth> is the physical bandwidth of the device, e.g. for
# ethernet 10Mbit or 100Mbit, for arcnet 2Mbit
# <weight> is tuning parameter that should be proportional to
# <bandwidth>. As a rule of thumb: <weight> = <bandwidth> / 10
#
# When you have more classes on one interface, it is enough to specify
# <bandwidth> [and <weight>] only once, therefore in other files you only
# need to set DEVICE=<ifname>.
#
### Class parameters
#
# RATE=<speed> mandatory
# RATE=5Mbit
#
# Bandwidth allocated to the class. Traffic going through the class is
# shaped to conform to specified rate. You can use Kbit, Mbit or bps,
# Kbps and Mbps as suffixes. If you don't specify any unit, bits/sec
# are used. Also note that "bps" means "bytes per second", not bits.
#
# WEIGHT=<speed> mandatory
# WEIGHT=500Kbit
#
# Tuning parameter that should be proportional to RATE. As a rule
# of thumb, use WEIGHT ~= RATE / 10.
#
# PRIO=<1-8> optional, default 5
# PRIO=5
#
# Priority of class traffic. The higher the number, the lesser
# the priority. Priority of 5 is just fine.
#
# PARENT=<clsid> optional, default not set
# PARENT=1280
#
# Specifies ID of the parent class to which you want this class be
# attached. You might want to use LEAF=none for the parent class as
# mentioned below. By using this parameter and carefully ordering the
# configuration files, it is possible to create simple hierarchical
# structures of CBQ classes. The ordering is important so that parent
# classes are constructed prior to their children.
#
# LEAF=none|tbf|sfq optional, default "tbf"
#
# Tells the script to attach specified leaf queueing discipline to CBQ
# class. By default, TBF is used. Note that attaching TBF to CBQ class
# shapes the traffic to conform to TBF parameters and prevents the class
# from borrowing bandwidth from its parent even if you have BOUNDED set
# to "no". To allow the class to borrow bandwidth (provided it is not
# bounded), you must set LEAF to "none" or "sfq".
#
# If you want to ensure (approximately) fair sharing of bandwidth among
# several hosts in the same class, you might want to specify LEAF=sfq to
# attach SFQ as leaf queueing discipline to that class.
#
# BOUNDED=yes|no optional, default "yes"
#
# If set to "yes", the class is not allowed to borrow bandwidth from
# its parent class in overlimit situation. If set to "no", the class
# will be allowed to borrow bandwidth from its parent.
#
# Note: Don't forget to set LEAF to "none" or "sfq", otherwise the class will
# have TBF attached to itself and will not be able to borrow unused
# bandwidth from its parent.
#
# ISOLATED=yes|no optional, default "no"
#
# If set to "yes", the class will not lend unused bandwidth to
# its children.
#
### TBF qdisc parameters
#
# BUFFER=<bytes>[/<bytes>] optional, default "10Kb/8"
#
# This parameter controls the depth of the token bucket. In other
# words it represents the maximal burst size the class can send.
# The optional part of parameter is used to determine the length
# of intervals in packet sizes, for which the transmission times
# are kept.
#
# LIMIT=<bytes> optional, default "15Kb"
#
# This parameter determines the maximal length of backlog. If
# the queue contains more data than specified by LIMIT, the
# newly arriving packets are dropped. The length of backlog
# determines queue latency in case of congestion.
#
# PEAK=<speed> optional, default not set
#
# Maximal peak rate for short-term burst traffic. This allows you
# to control the absolute peak rate the class can send at, because
# single TBF that allows 256Kbit/s would of course allow rate of
# 512Kbit for half a second or 1Mbit for a quarter of second.
#
# MTU=<bytes> optional, default "1500"
#
# Maximum number of bytes that can be sent at once over the
# physical medium. This parameter is required when you specify
# PEAK parameter. It defaults to MTU of ethernet - for other
# media types you might want to change it.
#
# Note: Setting TBF as leaf qdisc will effectively prevent the class from
# borrowing bandwidth from the ancestor class, because even if the
# class allows more traffic to pass through, it is then shaped to
# conform to TBF.
#
### SFQ qdisc parameters
#
# The SFQ queueing discipline is a cheap way for sharing class bandwidth
# among several hosts. As it is stochastic, the fairness is approximate but
# it will do the job in most cases. If you want real fairness, you should
# probably use WRR (weighted round robin) or WFQ queueing disciplines. Note
# that SFQ does not do any traffic shaping - the shaping is done by the CBQ
# class the SFQ is attached to.
#
# QUANTUM=<bytes> optional, default not set
#
# This parameter should not be set lower than link MTU, for ethernet
# it is 1500b, or (with MAC header) 1514b which is the value used
# in Alexey Kuznetsov's examples.
#
# PERTURB=<seconds> optional, default "10"
#
# Period of hash function perturbation. If unset, hash reconfiguration
# will never take place which is what you probably don't want. The
# default value of 10 seconds is probably a good one.
#
### Filter parameters
#
# RULE=[[saddr[/prefix]][:port[/mask]],][daddr[/prefix]][:port[/mask]]
#
# These parameters make up "u32" filter rules that select traffic for
# each of the classes. You can use multiple RULE fields per config.
#
# The optional port mask should only be used by advanced users who
# understand how the u32 filter works.
#
# Some examples:
#
# RULE=10.1.1.0/24:80
# selects traffic going to port 80 in network 10.1.1.0
#
# RULE=10.2.2.5
# selects traffic going to any port on single host 10.2.2.5
#
# RULE=10.2.2.5:20/0xfffe
# selects traffic going to ports 20 and 21 on host 10.2.2.5
#
# RULE=:25,10.2.2.128/26:5000
# selects traffic going from anywhere on port 25 to
# port 5000 in network 10.2.2.128
#
# RULE=10.5.5.5:80,
# selects traffic going from port 80 of single host 10.5.5.5
#
#
#
# REALM=[srealm,][drealm]
#
# These parameters make up "route" filter rules that classify traffic
# according to packet source/destination realms. For information about
# realms, see Alexey Kuznetsov's IP Command Reference. This script
# does not define any realms, it just builds "tc filter" commands
# for you if you need to classify traffic this way.
#
# Realm is either a decimal number or a string referencing entry in
# /etc/iproute2/rt_realms (usually).
#
# Some examples:
#
# REALM=russia,internet
# selects traffic going from realm "russia" to realm "internet"
#
# REALM=freenet,
# selects traffic going from realm "freenet"
#
# REALM=10
# selects traffic going to realm 10
#
#
#
# MARK=<mark>
#
# These parameters make up "fw" filter rules that select traffic for
# each of the classes according to firewall "mark". Mark is a decimal
# number packets are tagged with if firewall rules say so. You can
# use multiple MARK fields per config.
#
#
# Note: Rules for different filter types can be combined. Attention must be
# paid to the priority of filter rules, which can be set below using
# PRIO_{RULE,MARK,REALM} variables.
#
### Time ranging parameters
#
# TIME=[<dow>,<dow>, ...,<dow>/]<from>-<till>;<rate>/<weight>[/<peak>]
# TIME=0,1,2,5/18:00-06:00;256Kbit/25Kbit
# TIME=60123/18:00-06:00;256Kbit/25Kbit
# TIME=18:00-06:00;256Kbit/25Kbit
#
# This parameter allows you to differentiate the class bandwidth
# throughout the day. You can specify multiple TIME parameters; if
# the times overlap, the last match is taken. The fields <rate>, <weight>
# and <peak> correspond to parameters RATE, WEIGHT and PEAK (which
# is optional and applies to TBF leaf qdisc only).
#
# You can also specify days of week when the TIME rule applies. <dow>
# is numeric, 0 corresponds to sunday, 1 corresponds to monday, etc.
#
###
#
# Sample configuration file: cbq-1280.My_first_shaper
#
# --------------------------------------------------------------------------
# DEVICE=eth0,10Mbit,1Mbit
# RATE=128Kbit
# WEIGHT=10Kbit
# PRIO=5
# RULE=192.168.1.0/24
# --------------------------------------------------------------------------
#
# The configuration says that we will control traffic on 10Mbit ethernet
# device eth0 and the traffic going to network 192.168.1.0 will be
# processed with priority 5 and shaped to rate of 128Kbit.
#
# Note that you can control outgoing traffic only. If you want to control
# traffic in both directions, you must set up CBQ for both interfaces.
#
# Consider the following example:
#
# +---------+ 192.168.1.1
# BACKBONE -----eth0-| linux |-eth1------*-[client]
# +---------+
#
# Imagine you want to shape traffic from backbone to the client to 28Kbit
# and traffic in the opposite direction to 128Kbit. You need to setup CBQ
# on both eth0 and eth1 interfaces, thus you need two config files:
#
# cbq-028.backbone-client
# --------------------------------------------------------------------------
# DEVICE=eth1,10Mbit,1Mbit
# RATE=28Kbit
# WEIGHT=2Kbit
# PRIO=5
# RULE=192.168.1.1
# --------------------------------------------------------------------------
#
# cbq-128.client-backbone
# --------------------------------------------------------------------------
# DEVICE=eth0,10Mbit,1Mbit
# RATE=128Kbit
# WEIGHT=10Kbit
# PRIO=5
# RULE=192.168.1.1,
# --------------------------------------------------------------------------
#
# Pay attention to comma "," in the RULE field - it denotes source address!
#
# Enjoy.
#
#############################################################################
export LC_ALL=C

### Command locations
TC=/sbin/tc
IP=/sbin/ip
MP=/sbin/modprobe

### Default filter priorities (must be different)
### PRIO_RULE, PRIO_MARK and PRIO_REALM from the environment take precedence
PRIO_RULE_DEFAULT=${PRIO_RULE:-100}
PRIO_MARK_DEFAULT=${PRIO_MARK:-200}
PRIO_REALM_DEFAULT=${PRIO_REALM:-300}

### Default CBQ_PATH & CBQ_CACHE settings
CBQ_PATH=${CBQ_PATH:-/etc/sysconfig/cbq}
CBQ_CACHE=${CBQ_CACHE:-/var/cache/cbq.init}

### Uncomment to enable logfile for debugging
#CBQ_DEBUG="/var/run/cbq-$1"

### Modules to probe for. Uncomment the last CBQ_PROBE
### line if you have QoS support compiled into kernel
CBQ_PROBE="sch_cbq sch_tbf sch_sfq sch_prio"
CBQ_PROBE="$CBQ_PROBE cls_fw cls_u32 cls_route"
#CBQ_PROBE=""

### Keywords required for qdisc & class configuration
### (used as an alternation in an extended regular expression)
CBQ_WORDS="DEVICE|RATE|WEIGHT|PRIO|PARENT|LEAF|BOUNDED|ISOLATED"
CBQ_WORDS="$CBQ_WORDS|PRIO_MARK|PRIO_RULE|PRIO_REALM|BUFFER"
CBQ_WORDS="$CBQ_WORDS|LIMIT|PEAK|MTU|QUANTUM|PERTURB"

### Source AVPKT if it exists. The file is looked up in $CBQ_PATH first so
### that a CBQ_PATH override is respected; the historical fixed location is
### still tried afterwards. Only the first readable file is sourced.
for avpkt_file in "$CBQ_PATH/avpkt" /etc/sysconfig/cbq/avpkt; do
	if [ -r "$avpkt_file" ]; then
		. "$avpkt_file"
		break
	fi
done
unset avpkt_file
AVPKT=${AVPKT:-3000}
#############################################################################
############################# SUPPORT FUNCTIONS #############################
#############################################################################
### Get list of network devices
### Prints one interface name per line, taken from the numbered header
### lines of "ip link show" ("<index>: <name>: ..." or
### "<index>: <name>@<peer>: ..."). The name is everything up to the first
### ":" or "@", so names with dashes or capital letters are handled too.
### Lines that cannot be parsed are dropped instead of being passed on as
### bogus device names.
cbq_device_list () {
	ip link show| sed -n 's/^[0-9]\+: \([^:@[:space:]]\+\)[:@].*/\1/p'
} # cbq_device_list
### Delete the root qdisc of the device given as $1
### (error output of tc is discarded)
cbq_device_off () {
	local target=$1
	tc qdisc del dev $target root 2> /dev/null
} # cbq_device_off
### Walk over every device reported by cbq_device_list
### and remove its root qdisc
cbq_off () {
	for dev in $(cbq_device_list)
	do
		cbq_device_off $dev
	done
} # cbq_off
### Print all arguments on one line, prefixed with "**CBQ: ";
### backslash escapes in the text are interpreted (echo -e)
cbq_message () {
	echo -e "**CBQ: $*"
} # cbq_message
### Report the given message and terminate the script with status 1
cbq_failure () {
	cbq_message "$@"; exit 1
} # cbq_failure
### Report the given message, remove CBQ from all devices
### and terminate the script with status 1
cbq_fail_off () {
	cbq_message "$@"
	cbq_off; exit 1
} # cbq_fail_off
### Turn a "H:MM" / "HH:MM" time given as $1 into minutes since midnight
### and print the result
cbq_time2abs () {
	local hours=${1%%:*} minutes=${1##*:}

	### One leading zero is removed from each field before the arithmetic
	### (an empty field counts as 0 there)
	hours=${hours##0}
	minutes=${minutes##0}

	echo $(( hours*60 + minutes ))
} # cbq_time2abs
### Display CBQ setup
### For every device: queueing disciplines, then traffic classes, then
### filters. $1 (if given) is passed to tc as an option for the listing.
### As soon as one of the three listings is empty, the remaining ones are
### skipped for that device.
cbq_show () {
	local kind title

	for dev in $(cbq_device_list); do
		for kind in qdisc class filter; do
			case $kind in
				qdisc)	title="queueing disciplines" ;;
				class)	title="traffic classes" ;;
				filter)	title="filtering rules" ;;
			esac

			if [ $(tc $kind show dev $dev| wc -l) -eq 0 ]; then
				continue 2
			fi

			echo -e "### $dev: $title\n"
			tc $1 $kind show dev $dev; echo
		done
	done
} # cbq_show
### Scan the configuration directory $1 and set three global variables:
###   CLASSLIST - sorted names of the class files (cbq-*, no *~ backups)
###   DEVFIELDS - unique DEVICE values that carry at least a bandwidth
###   DEVICES   - unique device names taken from DEVFIELDS
### The script is terminated through cbq_failure when no class file or no
### usable DEVICE field is found, or when one device has differing fields.
cbq_init () {
	local ndevices nfields

	### Get a list of configured classes
	CLASSLIST=$(find $1 -maxdepth 1 \( -type f -or -type l \) -name 'cbq-*' \
		-not -name '*~' -printf "%f\n"| sort)
	if [ -z "$CLASSLIST" ]; then
		cbq_failure "no configuration files found in $1!"
	fi

	### Gather all DEVICE fields from $1/cbq-*
	### (comments and whitespace are removed before matching)
	DEVFIELDS=$(find $1 -maxdepth 1 \( -type f -or -type l \) -name 'cbq-*' \
		-not -name '*~' | xargs sed -n -e 's/#.*//' -e 's/[[:space:]]//g' \
		-e '/^DEVICE=[^,]*,[^,]*\(,[^,]*\)\?/ { s/.*=//; p; }'| sort -u)
	if [ -z "$DEVFIELDS" ]; then
		cbq_failure "no DEVICE field found in $1/cbq-*!"
	fi

	### Check for different DEVICE fields for the same device:
	### there must be exactly one field per device name
	DEVICES=$(echo "$DEVFIELDS"| sed 's/,.*//'| sort -u)
	ndevices=$(echo "$DEVICES"| wc -l)
	nfields=$(echo "$DEVFIELDS"| wc -l)
	[ $ndevices -ne $nfields ] &&
		cbq_failure "different DEVICE fields for single device!\n$DEVFIELDS"
} # cbq_init
### Load class configuration from $1/$2
### $1 is the configuration directory, $2 the class file name (cbq-<clsid>.<name>).
### Sets the globals CLASS, CFILE, IDVAL, BANDWIDTH and the class parameters
### (RATE, WEIGHT, PRIO, PARENT, LEAF, BOUNDED, ISOLATED, BUFFER, LIMIT, MTU,
### PEAK, PERTURB, QUANTUM, PRIO_RULE, PRIO_MARK, PRIO_REALM, DEVICE).
### Calls cbq_fail_off (which exits) on a bad class ID or on missing
### RATE, WEIGHT or DEVICE.
cbq_load_class () {
### Class ID: file name without the "cbq-" prefix and leading zeros,
### cut after the leading run of hexadecimal digits
CLASS=`echo $2| sed 's/^cbq-0*//; s/^\([0-9a-fA-F]\+\).*/\1/'`
### File content with comments and all whitespace removed; only lines of
### the form NAME=VALUE made of the permitted characters are kept
CFILE=`sed -n 's/#.*//; s/[[:space:]]//g; /^[[:alnum:]_]\+=[[:alnum:].,:;/*@-_]\+$/ p' $1/$2`
### Check class number
### (the hexadecimal class ID is converted to decimal for the range test)
IDVAL=`/usr/bin/printf "%d" 0x$CLASS 2> /dev/null`
[ $? -ne 0 -o $IDVAL -lt 2 -o $IDVAL -gt 65535 ] &&
cbq_fail_off "class ID of $2 must be in range <0002-FFFF>!"
### Set defaults & load class
RATE=""; WEIGHT=""; PARENT=""; PRIO=5
LEAF=tbf; BOUNDED=yes; ISOLATED=no
BUFFER=10Kb/8; LIMIT=15Kb; MTU=1500
PEAK=""; PERTURB=10; QUANTUM=""
PRIO_RULE=$PRIO_RULE_DEFAULT
PRIO_MARK=$PRIO_MARK_DEFAULT
PRIO_REALM=$PRIO_REALM_DEFAULT
### Lines whose name is one of $CBQ_WORDS are executed as shell assignments
### and override the defaults set above.
### NOTE(review): the value pattern used for CFILE permits ";" and other
### shell metacharacters, so this eval can execute text from the class
### file - class files have to be trusted.
eval `echo "$CFILE"| grep -E "^($CBQ_WORDS)="`
### Require RATE/WEIGHT
[ -z "$RATE" -o -z "$WEIGHT" ] &&
cbq_fail_off "missing RATE or WEIGHT in $2!"
### Class device
### (only the interface name, i.e. the part before the first comma, is kept)
DEVICE=${DEVICE%%,*}
[ -z "$DEVICE" ] && cbq_fail_off "missing DEVICE field in $2!"
### Bandwidth is the second field of the first DEVFIELDS entry for $DEVICE
BANDWIDTH=`echo "$DEVFIELDS"| sed -n "/^$DEVICE,/ \
{ s/[^,]*,\([^,]*\).*/\1/; p; q; }"`
### Convert to "tc" options
### (a value that is empty stays empty, so the option is simply omitted)
PEAK=${PEAK:+peakrate $PEAK}
PERTURB=${PERTURB:+perturb $PERTURB}
QUANTUM=${QUANTUM:+quantum $QUANTUM}
### Anything other than "no" means bounded; only "yes" means isolated
[ "$BOUNDED" = "no" ] && BOUNDED="" || BOUNDED="bounded"
[ "$ISOLATED" = "yes" ] && ISOLATED="isolated" || ISOLATED=""
} # cbq_load_class
#############################################################################
#################################### INIT ###################################
#############################################################################
### Check for presence of ip-route2 in usual place
[ -x $TC -a -x $IP ] ||
	cbq_failure "ip-route2 utilities not installed or executable!"

### ip/tc wrappers
### The rest of the script calls "ip" and "tc" only through these
### functions. Which pair gets defined depends on the mode:
###   compile   - "ip" runs for real, "tc" only prints its command line
###   CBQ_DEBUG - both run for real and are logged to $CBQ_DEBUG
###   otherwise - both run for real
if [ "$1" = "compile" ]; then
	### no module probing
	CBQ_PROBE=""

	ip () {
		$IP "$@"
	} # ip

	### echo-only version of "tc" command
	tc () {
		echo "$TC $@"
	} # tc

elif [ -n "$CBQ_DEBUG" ]; then
	echo -e "# `date`" > $CBQ_DEBUG

	### Logging version of "ip" command
	### The status of a pipeline is that of tee, which would hide every
	### failure of $IP from the "|| cbq_fail_off" checks of the callers,
	### so the status of $IP itself is handed back.
	ip () {
		echo -e "\n# ip $@" >> $CBQ_DEBUG
		$IP "$@" 2>&1 | tee -a $CBQ_DEBUG
		return ${PIPESTATUS[0]}
	} # ip

	### Logging version of "tc" command
	### (returns the status of $TC, see "ip" above)
	tc () {
		echo -e "\n# tc $@" >> $CBQ_DEBUG
		$TC "$@" 2>&1 | tee -a $CBQ_DEBUG
		return ${PIPESTATUS[0]}
	} # tc
else
	### Default wrappers

	ip () {
		$IP "$@"
	} # ip

	tc () {
		$TC "$@"
	} # tc
fi # ip/tc wrappers
case "$1" in

#############################################################################
############################### START/COMPILE ###############################
#############################################################################

### "start" sets CBQ up on all configured devices, "compile" prints the
### "tc" commands that would do so. Unless disabled ("start nocache",
### logging enabled), "start" runs the commands from the cache file
### $CBQ_CACHE, regenerating it through "$0 compile" when it is missing,
### older than the configuration, or "start invalidate" was requested.
start|compile)

	### Probe QoS modules (start only)
	for module in $CBQ_PROBE; do
		$MP $module || cbq_failure "failed to load module $module"
	done

	### If we are in compile/nocache/logging mode, don't bother with cache
	if [ "$1" != "compile" -a "$2" != "nocache" -a -z "$CBQ_DEBUG" ]; then
		VALID=1

		### validate the cache
		[ "$2" = "invalidate" -o ! -f $CBQ_CACHE ] && VALID=0
		if [ $VALID -eq 1 ]; then
			[ `find $CBQ_PATH -maxdepth 1 -newer $CBQ_CACHE| \
			  wc -l` -gt 0 ] && VALID=0
		fi

		### compile the config if the cache is invalid
		if [ $VALID -ne 1 ]; then
			if ! $0 compile > $CBQ_CACHE; then
				### A failed compile leaves a partial command list
				### (and its error messages) in the cache file. As the
				### file would be newer than the configuration, the next
				### "start" would run it as a valid cache. Show the
				### messages of the failed run and drop the file.
				grep '^\*\*CBQ: ' "$CBQ_CACHE" 2> /dev/null
				rm -f "$CBQ_CACHE"
				cbq_fail_off "failed to compile CBQ configuration!"
			fi
		fi

		### run the cached commands
		exec /bin/sh $CBQ_CACHE 2> /dev/null
	fi

	### Load DEVICES, DEVFIELDS and CLASSLIST
	cbq_init $CBQ_PATH

	### Setup root qdisc on all configured devices
	for dev in $DEVICES; do
		### Retrieve device bandwidth and, optionally, weight
		DEVTEMP=`echo "$DEVFIELDS"| sed -n "/^$dev,/ { s/$dev,//; p; q; }"`
		DEVBWDT=${DEVTEMP%%,*}; DEVWGHT=${DEVTEMP##*,}
		### No comma means no weight was given. Comparing with the whole
		### field (not with the bandwidth) keeps a weight that happens to
		### be equal to the bandwidth.
		[ "$DEVWGHT" = "$DEVTEMP" ] && DEVWGHT=""

		### Device bandwidth is required
		if [ -z "$DEVBWDT" ]; then
			cbq_message "could not determine bandwidth for device $dev!"
			cbq_failure "please set up the DEVICE fields properly!"
		fi

		### Check if the device is there
		ip link show $dev &> /dev/null ||
			cbq_fail_off "device $dev not found!"

		### Remove old root qdisc from device
		cbq_device_off $dev

		### Setup root qdisc + class for device
		tc qdisc add dev $dev root handle 1 cbq \
		bandwidth $DEVBWDT avpkt $AVPKT cell 8

		### Set weight of the root class if set
		[ -n "$DEVWGHT" ] &&
			tc class change dev $dev root cbq weight $DEVWGHT allot 1514

		[ "$1" = "compile" ] && echo
	done # dev

	### Setup traffic classes
	for classfile in $CLASSLIST; do
		cbq_load_class $CBQ_PATH $classfile

		### Create the class
		tc class add dev $DEVICE parent 1:$PARENT classid 1:$CLASS cbq \
		bandwidth $BANDWIDTH rate $RATE weight $WEIGHT prio $PRIO \
		allot 1514 cell 8 maxburst 20 avpkt $AVPKT $BOUNDED $ISOLATED ||
			cbq_fail_off "failed to add class $CLASS with parent $PARENT on $DEVICE!"

		### Create leaf qdisc if set
		if [ "$LEAF" = "tbf" ]; then
			tc qdisc add dev $DEVICE parent 1:$CLASS handle $CLASS tbf \
			rate $RATE buffer $BUFFER limit $LIMIT mtu $MTU $PEAK
		elif [ "$LEAF" = "sfq" ]; then
			tc qdisc add dev $DEVICE parent 1:$CLASS handle $CLASS sfq \
			$PERTURB $QUANTUM
		fi

		### Create fw filter for MARK fields
		for mark in `echo "$CFILE"| sed -n '/^MARK/ { s/.*=//; p; }'`; do
			### Attach fw filter to root class
			tc filter add dev $DEVICE parent 1:0 protocol ip \
			prio $PRIO_MARK handle $mark fw classid 1:$CLASS
		done ### mark

		### Create route filter for REALM fields
		for realm in `echo "$CFILE"| sed -n '/^REALM/ { s/.*=//; p; }'`; do
			### Split realm into source & destination realms
			SREALM=${realm%%,*}; DREALM=${realm##*,}
			### No comma means destination only. Comparing with the
			### whole field (as the RULE parser below does) keeps the
			### source when both realms are identical, e.g. REALM=10,10
			[ "$SREALM" = "$realm" ] && SREALM=""

			### Convert asterisks to empty strings
			SREALM=${SREALM#\*}; DREALM=${DREALM#\*}

			### Attach route filter to the root class
			tc filter add dev $DEVICE parent 1:0 protocol ip \
			prio $PRIO_REALM route ${SREALM:+from $SREALM} \
			${DREALM:+to $DREALM} classid 1:$CLASS
		done ### realm

		### Create u32 filter for RULE fields
		for rule in `echo "$CFILE"| sed -n '/^RULE/ { s/.*=//; p; }'`; do
			### Split rule into source & destination
			SRC=${rule%%,*}; DST=${rule##*,}
			[ "$SRC" = "$rule" ] && SRC=""

			### Split destination into address, port & mask fields
			DADDR=${DST%%:*}; DTEMP=${DST##*:}
			[ "$DADDR" = "$DST" ] && DTEMP=""

			DPORT=${DTEMP%%/*}; DMASK=${DTEMP##*/}
			[ "$DPORT" = "$DTEMP" ] && DMASK="0xffff"

			### Split up source (if specified)
			SADDR=""; SPORT=""
			if [ -n "$SRC" ]; then
				SADDR=${SRC%%:*}; STEMP=${SRC##*:}
				[ "$SADDR" = "$SRC" ] && STEMP=""

				SPORT=${STEMP%%/*}; SMASK=${STEMP##*/}
				[ "$SPORT" = "$STEMP" ] && SMASK="0xffff"
			fi

			### Convert asterisks to empty strings
			SADDR=${SADDR#\*}; DADDR=${DADDR#\*}

			### Compose u32 filter rules
			u32_s="${SPORT:+match ip sport $SPORT $SMASK}"
			u32_s="${SADDR:+match ip src $SADDR} $u32_s"
			u32_d="${DPORT:+match ip dport $DPORT $DMASK}"
			u32_d="${DADDR:+match ip dst $DADDR} $u32_d"

			### Uncomment the following if you want to see parsed rules
			#echo "$rule: $u32_s $u32_d"

			### Attach u32 filter to the appropriate class
			tc filter add dev $DEVICE parent 1:0 protocol ip \
			prio $PRIO_RULE u32 $u32_s $u32_d classid 1:$CLASS
		done ### rule

		[ "$1" = "compile" ] && echo
	done ### classfile
	;;
#############################################################################
################################# TIME CHECK ################################
#############################################################################
### For every class that has TIME rules, pick the rate/weight/peak of the
### last TIME rule matching the current weekday and time (or the class
### defaults when none matches) and replace the class, and its TBF leaf
### qdisc, if the rate currently reported by "tc" differs.
timecheck)

	### Get time + weekday
	TIME_TMP=`date +%w/%k:%M`
	TIME_DOW=${TIME_TMP%%/*}
	TIME_NOW=${TIME_TMP##*/}

	### Load DEVICES, DEVFIELDS and CLASSLIST
	cbq_init $CBQ_PATH

	### Run through all classes
	for classfile in $CLASSLIST; do
		### Gather all TIME rules from class config
		TIMESET=`sed -n 's/#.*//; s/[[:space:]]//g; /^TIME/ { s/.*=//; p; }' \
			$CBQ_PATH/$classfile`
		[ -z "$TIMESET" ] && continue

		MATCH=0; CHANGE=0
		for timerule in $TIMESET; do
			### Recomputed for every rule because the midnight wrap
			### fixup below may shift it by a day
			TIME_ABS=`cbq_time2abs $TIME_NOW`

			### Split TIME rule to pieces
			TIMESPEC=${timerule%%;*}; PARAMS=${timerule##*;}
			WEEKDAYS=${TIMESPEC%%/*}; INTERVAL=${TIMESPEC##*/}
			BEG_TIME=${INTERVAL%%-*}; END_TIME=${INTERVAL##*-}

			### Check the day-of-week (if present)
			[ "$WEEKDAYS" != "$INTERVAL" -a \
			  -n "${WEEKDAYS##*$TIME_DOW*}" ] && continue

			### Compute interval boundaries
			BEG_ABS=`cbq_time2abs $BEG_TIME`
			END_ABS=`cbq_time2abs $END_TIME`

			### Midnight wrap fixup
			if [ $BEG_ABS -gt $END_ABS ]; then
				[ $TIME_ABS -le $END_ABS ] &&
					TIME_ABS=$(( TIME_ABS + 24*60 ))

				END_ABS=$(( END_ABS + 24*60 ))
			fi

			### If the time matches, remember params and set MATCH flag
			if [ $TIME_ABS -ge $BEG_ABS -a $TIME_ABS -lt $END_ABS ]; then
				TMP_RATE=${PARAMS%%/*}; PARAMS=${PARAMS#*/}
				TMP_WGHT=${PARAMS%%/*}; TMP_PEAK=${PARAMS##*/}
				### PARAMS is now "<weight>[/<peak>]"; without a slash
				### there is no peak. Comparing with the whole remainder
				### (not with the weight) keeps a peak that happens to
				### be equal to the weight.
				[ "$TMP_PEAK" = "$PARAMS" ] && TMP_PEAK=""
				TMP_PEAK=${TMP_PEAK:+peakrate $TMP_PEAK}

				MATCH=1
			fi
		done ### timerule

		cbq_load_class $CBQ_PATH $classfile

		### Get current RATE of CBQ class
		RATE_NOW=`tc class show dev $DEVICE| sed -n \
			"/cbq 1:$CLASS / { s/.*rate //; s/ .*//; p; q; }"`
		[ -z "$RATE_NOW" ] && continue

		### Time interval matched
		if [ $MATCH -ne 0 ]; then

			### Check if there is any change in class RATE
			if [ "$RATE_NOW" != "$TMP_RATE" ]; then
				NEW_RATE="$TMP_RATE"
				NEW_WGHT="$TMP_WGHT"
				NEW_PEAK="$TMP_PEAK"
				CHANGE=1
			fi

		### Match not found, reset to default RATE if necessary
		elif [ "$RATE_NOW" != "$RATE" ]; then
			NEW_WGHT="$WEIGHT"
			NEW_RATE="$RATE"
			NEW_PEAK="$PEAK"
			CHANGE=1
		fi

		### If there are no changes, go for next class
		[ $CHANGE -eq 0 ] && continue

		### Replace CBQ class
		tc class replace dev $DEVICE classid 1:$CLASS cbq \
		bandwidth $BANDWIDTH rate $NEW_RATE weight $NEW_WGHT prio $PRIO \
		allot 1514 cell 8 maxburst 20 avpkt $AVPKT $BOUNDED $ISOLATED

		### Replace leaf qdisc (if any)
		if [ "$LEAF" = "tbf" ]; then
			tc qdisc replace dev $DEVICE handle $CLASS tbf \
			rate $NEW_RATE buffer $BUFFER limit $LIMIT mtu $MTU $NEW_PEAK
		fi

		cbq_message "$TIME_NOW: class $CLASS on $DEVICE changed rate ($RATE_NOW -> $NEW_RATE)"
	done ### class file
	;;
#############################################################################
################################## THE REST #################################
#############################################################################
### Remove CBQ from all devices
stop)
	cbq_off ;;

### Show qdiscs, classes and filters of all devices
list)
	cbq_show ;;

### Same listing, with "-s" passed on to tc
stats)
	cbq_show -s ;;

### Stop, then start again; arguments following "restart" are
### handed over to "start"
restart)
	shift
	$0 stop
	$0 start "$@"
	;;

### Anything else: print the list of known commands
*)
	echo "Usage: $(basename $0) {start|compile|stop|restart|timecheck|list|stats}"
esac

76
examples/cbqinit.eth1 Normal file
View File

@ -0,0 +1,76 @@
#! /bin/sh
# Example CBQ configuration for a single 10Mbit interface: one root class,
# three best-effort classes (bulk, interactive, background) with sfq leaves,
# and two classes for realtime (RSVP) traffic.
#
# Paths to the tc and ip binaries; change to reflect yours.
# ($IP is defined here but not used anywhere in this script.)
TC=/home/root/tc
IP=/home/root/ip
DEVICE=eth1
BANDWIDTH="bandwidth 10Mbit"
# Attach CBQ on $DEVICE. It will have handle 1:.
# $BANDWIDTH is the real $DEVICE bandwidth (10Mbit).
# avpkt is the average packet size.
# mpu is the minimal packet size.
$TC qdisc add dev $DEVICE root handle 1: cbq \
$BANDWIDTH avpkt 1000 mpu 64
# Create root class with classid 1:1. This step is not necessary.
# bandwidth is the same as on CBQ itself.
# rate == all the bandwidth
# allot is MTU + MAC header
# maxburst measures the allowed class burstiness (please read the
# S.Floyd and VJ papers)
# est 1sec 8sec means that the kernel will evaluate the average rate
# on this class with period 1sec and time constant 8sec.
# This rate is viewed with "tc -s class ls dev $DEVICE"
$TC class add dev $DEVICE parent 1:0 classid :1 est 1sec 8sec cbq \
$BANDWIDTH rate 10Mbit allot 1514 maxburst 50 avpkt 1000
# Bulk.
# New parameters are:
# weight, which is set to be proportional to
# "rate". It is not necessary, weight=1 will work as well.
# defmap and split say that best-effort traffic not classified
# by other means will fall into this class.
$TC class add dev $DEVICE parent 1:1 classid :2 est 1sec 8sec cbq \
$BANDWIDTH rate 4Mbit allot 1514 weight 500Kbit \
prio 6 maxburst 50 avpkt 1000 split 1:0 defmap ff3d
# OPTIONAL.
# Attach "sfq" qdisc to this class, quantum is MTU, perturb
# gives period of hash function perturbation in seconds.
#
$TC qdisc add dev $DEVICE parent 1:2 sfq quantum 1514b perturb 15
# Interactive-burst class
$TC class add dev $DEVICE parent 1:1 classid :3 est 2sec 16sec cbq \
$BANDWIDTH rate 1Mbit allot 1514 weight 100Kbit \
prio 2 maxburst 100 avpkt 1000 split 1:0 defmap c0
$TC qdisc add dev $DEVICE parent 1:3 sfq quantum 1514b perturb 15
# Background.
# NOTE(review): weight 10Mbit is far above this class's 100Kbit rate, unlike
# the other classes whose weight is a fraction of their rate -- confirm
# whether 10Kbit was intended.
$TC class add dev $DEVICE parent 1:1 classid :4 est 1sec 8sec cbq \
$BANDWIDTH rate 100Kbit allot 1514 weight 10Mbit \
prio 7 maxburst 10 avpkt 1000 split 1:0 defmap 2
$TC qdisc add dev $DEVICE parent 1:4 sfq quantum 1514b perturb 15
# Realtime class for RSVP
$TC class add dev $DEVICE parent 1:1 classid 1:7FFE cbq \
rate 5Mbit $BANDWIDTH allot 1514b avpkt 1000 \
maxburst 20
# Reclassified realtime traffic
#
# New element: split is not 1:0, but 1:7FFE. It means
# that only real-time packets which violated policing filters
# or exceeded reshaping buffers will fall into it.
$TC class add dev $DEVICE parent 1:7FFE classid 1:7FFF est 4sec 32sec cbq \
rate 1Mbit $BANDWIDTH allot 1514b avpkt 1000 weight 10Kbit \
prio 6 maxburst 10 split 1:7FFE defmap ffff

446
examples/dhcp-client-script Normal file
View File

@ -0,0 +1,446 @@
#!/bin/bash
#
# dhclient-script for Linux.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
#
# Probably, I did not understand, what this funny feature as "alias"
# means exactly. For now I suppose, that it is a static address, which
# we should install and preserve.
#
# From here on, append everything this script prints (stdout and stderr)
# to /var/log/DHS.log.
exec >> /var/log/DHS.log 2>&1
# Record how the script was invoked and the reason passed in by the caller.
echo dhc-script $* reason=$reason
# Dump every old_*, new_* and check_* variable currently set, for debugging.
set | grep "^\(old_\|new_\|check_\)"
# Print a trace line: the word LOG followed by all arguments.
LOG () {
    echo LOG $*
}
# Convert one netmask octet to the number of leading one bits.
# arg:     $1 = mask octet
# returns: 1..8 (as exit status) when the octet equals 256 - 2^N for
#          N = 7..0; 0 for every other value, including 0 itself
#
Mask8ToLen() {
    local zeros
    for zeros in 0 1 2 3 4 5 6 7; do
        # an octet with N trailing zero bits satisfies 2^N + octet == 256
        if [ $(( (1 << zeros) + $1 )) -eq 256 ]; then
            return $(( 8 - zeros ))
        fi
    done
    return 0
}
# Convert an IPv4 dotted-quad netmask to a prefix length.
# arg:     $1 = dotquad mask
# returns: the prefix length as exit status, or 255 when the mask has none
#          of the recognised shapes.  The single partial octet is converted
#          by Mask8ToLen, which yields 0 for a non-contiguous octet.
#
MaskToLen() {
    local masklen=0
    local mask8=$1
    case $1 in
    0.0.0.0)
        return 0
        ;;
    255.*.0.0)
        masklen=8
        mask8=${mask8#255.}
        mask8=${mask8%.0.0}
        ;;
    255.255.*.0)
        masklen=16
        mask8=${mask8#255.255.}
        mask8=${mask8%.0}
        ;;
    255.255.255.*)
        masklen=24
        mask8=${mask8#255.255.255.}
        ;;
    *.0.0.0)
        # Masks shorter than /8 (e.g. 128.0.0.0): only the first octet is
        # partial.  Without this case they were reported as invalid (255).
        masklen=0
        mask8=${mask8%.0.0.0}
        ;;
    *)
        return 255
        ;;
    esac
    Mask8ToLen $mask8
    return $(( $? + masklen ))
}
# Print the classful ("natural") netmask of an address.
# arg:    $1 = dotquad address
# output: 255.255.255.255 and 0.0.0.0 are echoed back unchanged; otherwise
#         the mask is chosen from the first octet:
#         >= 224 -> 240.0.0.0, >= 192 -> 255.255.255.0,
#         >= 128 -> 255.255.0.0, anything else -> 255.0.0.0
#
ABCMask () {
    local first_octet="${1%%.*}"
    local natural=255.0.0.0
    case $1 in
    255.255.255.255|0.0.0.0)
        echo $1
        return
        ;;
    esac
    if [ $first_octet -ge 224 ]; then
        natural=240.0.0.0
    elif [ $first_octet -ge 192 ]; then
        natural=255.255.255.0
    elif [ $first_octet -ge 128 ]; then
        natural=255.255.0.0
    fi
    echo $natural
}
# Return the length of the classful ("natural") netmask of an address.
# arg:     $1 = dotquad address
# returns: (as exit status) 32 for 255.255.255.255, 0 for 0.0.0.0,
#          otherwise by first octet: >= 224 -> 4, >= 192 -> 24,
#          >= 128 -> 16, anything else -> 8
#
ABCMaskLen () {
    local first_octet="${1%%.*}"
    case $1 in
    255.255.255.255)
        return 32
        ;;
    0.0.0.0)
        return 0
        ;;
    esac
    [ $first_octet -ge 224 ] && return 4
    [ $first_octet -ge 192 ] && return 24
    [ $first_octet -ge 128 ] && return 16
    return 8
}
# Delete IP address by bringing the (aliased) interface down.
# args: $1 = interface
#       $2 = address   (only logged)
#       $3 = mask      (only logged)
#       $4 = broadcast (only logged)
#       $5 = label; when non-empty the target becomes "$1:$5"
#
DelINETAddr () {
    local addrid=$1
    LOG DelINETAddr $*
    if [ "$5" ]; then
        addrid=$addrid:$5
    fi
    LOG ifconfig $addrid down
    ifconfig $addrid down
}
# Add IP address by configuring the (aliased) interface and bringing it up.
# args: $1 = interface
#       $2 = address
#       $3 = mask      (optional, passed as "netmask ...")
#       $4 = broadcast (optional, passed as "broadcast ...")
#       $5 = label; when non-empty the target becomes "$1:$5"
#
AddINETAddr () {
    local target=$1
    # each option expands to nothing when its argument is empty
    local netmask_opt=${3:+netmask $3}
    local broadcast_opt=${4:+broadcast $4}
    LOG AddINETAddr $*
    if [ "$5" ]; then
        target=$target:$5
    fi
    LOG ifconfig $target $2 $netmask_opt $broadcast_opt up
    ifconfig $target $2 $netmask_opt $broadcast_opt up
}
# Add one default route per router.
# args: $1 = whitespace separated routers list; nothing is done when empty
#
AddDefaultRoutes() {
    local gw
    [ "$1" ] || return 0
    LOG AddDefaultRoutes $*
    for gw in $1; do
        LOG route add default gw $gw
        route add default gw $gw
    done
}
# Delete the default route through each router.
# args: $1 = whitespace separated routers list; nothing is done when empty
#
DelDefaultRoutes() {
    local gw
    [ "$1" ] || return 0
    LOG DelDefaultRoutes $*
    for gw in $1; do
        LOG route del default gw $gw
        route del default gw $gw
    done
}
# Ping a host once (ping -q -c 1 -w 2).
# args:    $1 = dotquad address of the host
# returns: 0 if ping succeeded, 1 otherwise
#
PingNode() {
    LOG PingNode $*
    ping -q -c 1 -w 2 $1 && return 0
    return 1
}
# Ping every router in the list and add a default route through each one
# that answers.
# args:    $1 = routers list
# returns: 0 if at least one router is alive, 1 otherwise
#
CheckRouterList() {
    local gw
    local status=1
    LOG CheckRouterList $*
    for gw in $1; do
        PingNode $gw || continue
        status=0
        route add default gw $gw
    done
    return $status
}
# Delete/create static routes.
# args: $1 = operation (del/add)
#       $2 = routes list in format "dst1 nexthop1 dst2 ..."
#
# Each destination is installed as a network route whose netmask is the
# classful mask returned by ABCMask.  A trailing destination without a
# nexthop is ignored.
#
# BEWARE: this feature of DHCP is obsolete, because it does not
# support subnetting.
#
X-StaticRouteList() {
    local op=$1
    local lst="$2"
    LOG X-StaticRouteList $*
    if [ "$lst" ]; then
        # "--" stops option parsing, so a list entry that starts with "-"
        # cannot be taken for a shell option by set
        set -- $lst
        while [ $# -gt 1 ]; do
            route $op -net $1 netmask `ABCMask "$1"` gw $2
            shift; shift;
        done
    fi
}
# Install static routes; thin wrapper around X-StaticRouteList add.
# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..."
#
AddStaticRouteList() {
    LOG AddStaticRouteList $*
    X-StaticRouteList add "$1"
}
# Remove static routes; thin wrapper around X-StaticRouteList del.
# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..."
#
DelStaticRouteList() {
    LOG DelStaticRouteList $*
    X-StaticRouteList del "$1"
}
# Broadcast unsolicited ARP to update neighbours' caches.
# Runs "arping -A" immediately and "arping -U" two seconds later, both in
# the background; does nothing when /sbin/arping is missing.
# args: $1 = interface
#       $2 = address
#
UnsolicitedARP() {
    [ -f /sbin/arping ] || return 0
    /sbin/arping -A -c 1 -I "$1" "$2" &
    (sleep 2 ; /sbin/arping -U -c 1 -I "$1" "$2" ) &
}
# Duplicate address detection.
# args:    $1 = interface
#          $2 = test address
# returns: the exit status of "arping -D"; 0 when /sbin/arping is missing,
#          so detection is treated as passed.
DAD() {
    [ -f /sbin/arping ] || return 0
    /sbin/arping -c 2 -w 3 -D -I "$1" "$2"
}
# Setup resolver.
# args: none; the domain and the nameserver list are passed in the global
#       variables new_domain_name / new_domain_name_servers (compared
#       against old_domain_name / old_domain_name_servers).
#
# The new configuration is always written to /etc/resolv.conf.dhcp first.
# It replaces /etc/resolv.conf only when that file is missing, or when it
# starts with the signature line below and the domain or the server list
# actually changed.
#
# NOTE: we try to be careful and not to break user supplied resolv.conf.
# The script mangles it only if it has the dhcp magic signature.
#
UpdateDNS() {
    local nameserver
    local idstring="#### Generated by DHCPCD"
    LOG UpdateDNS $*
    if [ "$new_domain_name" = "" -a "$new_domain_name_servers" = "" ]; then
        return 0;
    fi
    echo $idstring > /etc/resolv.conf.dhcp
    if [ "$new_domain_name" ]; then
        echo search $new_domain_name >> /etc/resolv.conf.dhcp
    fi
    echo options ndots:1 >> /etc/resolv.conf.dhcp
    if [ "$new_domain_name_servers" ]; then
        for nameserver in $new_domain_name_servers; do
            echo nameserver $nameserver >> /etc/resolv.conf.dhcp
        done
    else
        # no servers supplied: fall back to a resolver on localhost
        echo nameserver 127.0.0.1 >> /etc/resolv.conf.dhcp
    fi
    if [ -f /etc/resolv.conf ]; then
        # leave a resolv.conf that does not carry our signature alone
        if [ "`head -1 /etc/resolv.conf`" != "$idstring" ]; then
            return 0
        fi
        # The test spans two lines, so the first one must end with a
        # backslash; without it "[" fails on the missing "]" and the
        # second line is run as a command.
        if [ "$old_domain_name" = "$new_domain_name" -a \
             "$new_domain_name_servers" = "$old_domain_name_servers" ]; then
            return 0
        fi
    fi
    mv /etc/resolv.conf.dhcp /etc/resolv.conf
}
# Main dispatcher: act on the event named in $reason.  Every branch ends
# the script with its own exit status; unknown reasons fall through to the
# final "exit 0".
case $reason in
# NBI and MEDIUM do no work here; NBI reports failure, MEDIUM success.
NBI)
exit 1
;;
MEDIUM)
exit 0
;;
# PREINIT: take both aliases ($interface:dhcp and $interface:dhcp1) down
# and bring $interface:dhcp up again.
PREINIT)
ifconfig $interface:dhcp down
ifconfig $interface:dhcp1 down
if [ -d /proc/sys/net/ipv4/conf/$interface ]; then
# assign a placeholder address to the alias and take it down again
ifconfig $interface:dhcp 10.10.10.10 netmask 255.255.255.255
ifconfig $interface:dhcp down
# NOTE(review): this repeats the enclosing test; presumably meant to detect
# an interface that stays configured after the alias is removed -- confirm.
if [ -d /proc/sys/net/ipv4/conf/$interface ]; then
LOG The interface $interface already configured.
fi
fi
ifconfig $interface:dhcp up
exit 0
;;
ARPSEND)
exit 0
;;
# ARPCHECK: succeed only if duplicate address detection passes for
# $check_ip_address.
ARPCHECK)
if DAD "$interface" "$check_ip_address" ; then
exit 0
fi
exit 1
;;
BOUND|RENEW|REBIND|REBOOT)
# Remove the alias address first when it differs from the old lease address.
if [ "$old_ip_address" -a "$alias_ip_address" -a \
"$alias_ip_address" != "$old_ip_address" ]; then
DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
# The lease address changed: drop the old address and its routes.
if [ "$old_ip_address" -a "$old_ip_address" != "$new_ip_address" ]; then
DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
DelDefaultRoutes "$old_routers"
DelStaticRouteList "$old_static_routes"
fi
# (Re)install the new address and routes when there was no old address,
# the address changed, or the reason is BOUND or REBOOT.
if [ "$old_ip_address" = "" -o "$old_ip_address" != "$new_ip_address" -o \
"$reason" = "BOUND" -o "$reason" = "REBOOT" ]; then
AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
AddStaticRouteList "$new_static_routes"
AddDefaultRoutes "$new_routers"
UnsolicitedARP "$interface" "$new_ip_address"
fi
# Put the alias address back unless it equals the new lease address.
if [ "$new_ip_address" != "$alias_ip_address" -a "$alias_ip_address" ]; then
AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
UpdateDNS
exit 0
;;
# EXPIRE/FAIL: remove the old lease address and its routes; the alias
# address is taken down first and re-added afterwards.
EXPIRE|FAIL)
if [ "$alias_ip_address" ]; then
DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
if [ "$old_ip_address" ]; then
DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
DelDefaultRoutes "$old_routers"
DelStaticRouteList "$old_static_routes"
fi
if [ "$alias_ip_address" ]; then
AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
exit 0
;;
# TIMEOUT: try the offered address; keep it only if duplicate address
# detection passes and at least one router answers a ping.
TIMEOUT)
if [ "$alias_ip_address" ]; then
DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
# It seems <null address> means that no more old leases were found.
# Or does it mean a bug in dhcpcd? 8) Fail for now.
if [ "$new_ip_address" = "<null address>" ]; then
if [ "$old_ip_address" ]; then
DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp
fi
if [ "$alias_ip_address" ]; then
AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
exit 1
fi
if DAD "$interface" "$new_ip_address" ; then
AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
UnsolicitedARP "$interface" "$new_ip_address"
if [ "$alias_ip_address" -a "$alias_ip_address" != "$new_ip_address" ]; then
AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
UnsolicitedARP "$interface" "$alias_ip_address"
fi
# CheckRouterList also adds a default route via every router that answers.
if CheckRouterList "$new_routers" ; then
AddStaticRouteList "$new_static_routes"
UpdateDNS
exit 0
fi
fi
# Reached when DAD failed or no router answered: undo the new address,
# remove the old routes, restore the alias and report failure.
DelINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp
DelDefaultRoutes "$old_routers"
DelStaticRouteList "$old_static_routes"
if [ "$alias_ip_address" ]; then
AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1
fi
exit 1
;;
esac
exit 0

68
examples/diffserv/Edge1 Normal file
View File

@ -0,0 +1,68 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities
# This script just tags packets on the ingress interface using ipchains;
# the resulting fwmark is used for fast classification and re-marking
# on the egress interface
#
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
EGDEV="dev eth1"
#
# tag all incoming packets from host 10.2.0.24 to value 1
# tag all incoming packets from host 10.2.0.3 to value 2
# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3
#These values are used in the egress
#
############################################################
# the subnet rule is written with its network address, matching the
# description above
$IPCHAINS -A input -s 10.2.0.0/24 -m 3
$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 set_tc_index
#
# values of the DSCP to change depending on the class
#
#becomes EF
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
    value 0xb8
#becomes AF11
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
    value 0x28
#becomes AF21
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
    value 0x48
#
#
# The class mapping: fwmark N selects class 1:N
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3
#
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent 1:0
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0

87
examples/diffserv/Edge2 Normal file
View File

@ -0,0 +1,87 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities
# This script tags the fwmark on the ingress interface using ipchains;
# the result is used first for policing on the ingress interface, then
# for fast classification and re-marking
# on the egress interface
#
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
EGDEV="dev eth1"
#
# tag all incoming packets from host 10.2.0.24 to value 1
# tag all incoming packets from host 10.2.0.3 to value 2
# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3
#These values are used in the egress
############################################################
$IPCHAINS -A input -s 10.2.0.0/24 -m 3
$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2
############################################################
#
# install the ingress qdisc on the ingress interface
############################################################
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
# attach a fw classifier to the ingress which polices anything marked
# by ipchains with tag value 3 (the rest of the subnet packets -- not
# tag 1 or 2) so that it does not go beyond 1.5Mbps.
# Allow up to at least 60 packets to burst (assuming a maximum packet
# size of 1.5 KB) in the long run and up to about 6 packets in the
# short run
############################################################
$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 3 fw \
police rate 1500kbit burst 90k mtu 9k drop flowid :1
############################################################
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
value 0xb8
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
value 0x28
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
value 0x48
#
#
# The class mapping: fwmark N selects class 1:N
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3
#
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#
#deleting the ingress qdisc
#$TC qdisc del dev $INDEV ingress

View File

@ -0,0 +1,170 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities using u32 classifier
# This script tags tcindex based on metering on the ingress
# interface; the result is used for fast classification and re-marking
# on the egress interface
# This is an example of a color aware mode marker with PIR configured
# based on draft-wahjak-mcm-00.txt (section 3.1)
#
# The colors are defined using the Diffserv Fields
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/usr/src/iproute2-current
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
INDEV=eth0
EGDEV="dev eth1"
CIR1=1500kbit
CIR2=1000kbit
#The CBS is about 60 MTU sized packets
CBS1=90k
CBS2=90k
############################################################
#
# install the ingress qdisc on the ingress interface
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
# Create u32 filters
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1: u32 \
divisor 1
############################################################
# The meters: Note that we have shared meters in this case as identified
# by the index parameter
# (meter5 is defined but not referenced by any filter below)
meter1=" police index 1 rate $CIR1 burst $CBS1 "
meter2=" police index 2 rate $CIR2 burst $CBS1 "
meter3=" police index 3 rate $CIR2 burst $CBS2 "
meter4=" police index 4 rate $CIR1 burst $CBS2 "
meter5=" police index 5 rate $CIR1 burst $CBS2 "
# All packets are marked with a tcindex value which is used on the egress
# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
# *********************** AF41 ***************************
#AF41 (DSCP 0x22) is passed on with a tcindex value 1
#if it doesn't exceed its CIR/CBS
#policer 1 is used.
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
match ip tos 0x88 0xfc \
$meter1 \
continue flowid :1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
# tcindex value of 2
# policer 2 is used
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
match ip tos 0x88 0xfc \
$meter2 \
continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (policer 3)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
match ip tos 0x88 0xfc \
$meter3 \
drop flowid :3
#
# *********************** AF42 ***************************
#AF42 (DSCP 0x24) is passed on with a tcindex value 2
#if it doesn't exceed its CIR/CBS
#policer 2 is used. Note that this is shared with the AF41
#
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
match ip tos 0x90 0xfc \
$meter2 \
continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (policer 3)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
match ip tos 0x90 0xfc \
$meter3 \
drop flowid :3
#
# *********************** AF43 ***************************
#
#AF43 (DSCP 0x26) is passed on with a tcindex value 3
#if it doesn't exceed its CIR/CBS
#policer 3 is used. Note that this is shared with the AF41 and AF42
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
match ip tos 0x98 0xfc \
$meter3 \
drop flowid :3
#
# *********************** BE ***************************
#
# Anything else (not from the AF4*) gets discarded if it
# exceeds its meter (meter4) and by default goes to BE if it doesn't
# Note that the BE class is also used by the AF4* in the worst
# case
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \
match ip src 0/0\
$meter4 \
drop flowid :4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#note that the ECN bits are masked out
#
#AF41 (0x88 is 0x22 shifted to the left by two bits)
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
    value 0x88
#AF42
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
    value 0x90
#AF43
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
    value 0x98
#BE -- class 1:4, the class that tcindex 4 is mapped to; writing this
#value to 1:3 would overwrite the AF43 setting and leave 1:4 unset
$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
    value 0x0
#
#
# The class mapping: tcindex N selects class 1:N
#
for idx in 1 2 3 4; do
    $TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
        handle $idx tcindex classid 1:$idx
done
#
#
# Print qdisc, class and filter state for one side.
# args: $1 = label, $2 = device arguments for tc, $3 = filter parent
show_tc_state() {
    echo "---- qdisc parameters $1 ----------"
    $TC qdisc ls $2
    echo "---- Class parameters $1 ----------"
    $TC class ls $2
    echo "---- filter parameters $1 ----------"
    $TC filter ls $2 parent $3
}
show_tc_state Ingress "dev $INDEV" ffff:
show_tc_state Egress "$EGDEV" 1:0
#
#deleting the ingress qdisc
#$TC qdisc del dev $INDEV ingress

View File

@ -0,0 +1,132 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities
# This script uses fwmark tags (ipchains) and metering on the ingress
# interface; the result is used for fast classification and re-marking
# on the egress interface
# This is an example of a color blind mode marker with no PIR configured
# based on draft-wahjak-mcm-00.txt (section 3.1)
#
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
EGDEV="dev eth1"
CIR1=1500kbit
CIR2=1000kbit
#The CBS is about 60 MTU sized packets
CBS1=90k
CBS2=90k
# The meters (meter4 is defined but not referenced by any filter below)
meter1="police rate $CIR1 burst $CBS1 "
meter2="police rate $CIR1 burst $CBS2 "
meter3="police rate $CIR2 burst $CBS1 "
meter4="police rate $CIR2 burst $CBS2 "
meter5="police rate $CIR2 burst $CBS2 "
#
# tag incoming packets from subnet 10.2.0.0/24 with fw value 1
# tag all incoming packets from any other subnet with fw tag 2
############################################################
$IPCHAINS -A input -i $INDEV -s 0/0 -m 2
$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1
#
############################################################
# install the ingress qdisc on the ingress interface
$TC qdisc add dev $INDEV handle ffff: ingress
#
############################################################
# All packets are marked with a tcindex value which is used on the egress
# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
#
############################################################
#
# anything with fw tag of 1 is passed on with a tcindex value 1
#if it doesn't exceed its allocated rate (CIR/CBS)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1 fw \
$meter1 \
continue flowid 4:1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
#tcindex value of 2
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \
$meter2 \
continue flowid 4:2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \
$meter3 \
drop flowid 4:3
#
# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
# exceeds 1Mbps and by default goes to BE if it doesn't
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 2 fw \
$meter5 \
drop flowid 4:4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#note that the ECN bits are masked out
#
# class:value pairs --
#   1 -> AF41 (0x88 is 0x22 shifted to the left by two bits)
#   2 -> AF42, 3 -> AF43, 4 -> BE
#
for pair in 1:0x88 2:0x90 3:0x98 4:0x0; do
    $TC class change $EGDEV classid 1:${pair%%:*} dsmark mask 0x3 \
        value ${pair#*:}
done
#
#
# The class mapping: tcindex N selects class 1:N (the fw classifier
# could easily have been used here instead)
#
for idx in 1 2 3 4; do
    $TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
        handle $idx tcindex classid 1:$idx
done
#
#
# Print qdisc, class and filter state for one side.
# args: $1 = label, $2 = device arguments for tc, $3 = filter parent
show_tc_state() {
    echo "---- qdisc parameters $1 ----------"
    $TC qdisc ls $2
    echo "---- Class parameters $1 ----------"
    $TC class ls $2
    echo "---- filter parameters $1 ----------"
    $TC filter ls $2 parent $3
}
show_tc_state Ingress "dev $INDEV" ffff:
show_tc_state Egress "$EGDEV" 1:0
#
#deleting the ingress qdisc
#$TC qdisc del dev $INDEV ingress

View File

@ -0,0 +1,198 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities using u32 classifier
# This script tags tcindex based on metering on the ingress
# interface; the result is used for fast classification and re-marking
# on the egress interface
# This is an example of a color aware mode marker with PIR configured
# based on draft-wahjak-mcm-00.txt (section 3.2)
#
# The colors are defined using the Diffserv Fields
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
EGDEV="dev eth1"
CIR1=1000kbit
CIR2=500kbit
# the PIR is what is in excess of the CIR
PIR1=1000kbit
PIR2=500kbit
#The CBS is about 60 MTU sized packets
CBS1=90k
CBS2=90k
#the EBS is about 20 max sized packets
EBS1=30k
EBS2=30k
# The meters: Note that we have shared meters in this case as identified
# by the index parameter
# Each meterN polices the committed rate (CIR/CBS); the matching meterNa
# polices the excess rate (PIR/EBS).
meter1=" police index 1 rate $CIR1 burst $CBS1 "
meter1a=" police index 2 rate $PIR1 burst $EBS1 "
meter2=" police index 3 rate $CIR2 burst $CBS1 "
meter2a=" police index 4 rate $PIR2 burst $EBS1 "
meter3=" police index 5 rate $CIR2 burst $CBS2 "
meter3a=" police index 6 rate $PIR2 burst $EBS2 "
meter4=" police index 7 rate $CIR1 burst $CBS2 "
############################################################
#
# install the ingress qdisc on the ingress interface
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
# All packets are marked with a tcindex value which is used on the egress
# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
#
# *********************** AF41 ***************************
#AF41 (DSCP 0x22) is passed on with a tcindex value 1
#if it doesn't exceed its CIR/CBS + PIR/EBS
#meters meter1 and meter1a are used.
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \
match ip tos 0x88 0xfc \
$meter1 \
continue flowid :1
$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \
match ip tos 0x88 0xfc \
$meter1a \
continue flowid :1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
# tcindex value of 2
# meters meter2 and meter2a are used
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \
match ip tos 0x88 0xfc \
$meter2 \
continue flowid :2
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
match ip tos 0x88 0xfc \
$meter2a \
continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (meters meter3 and meter3a)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
match ip tos 0x88 0xfc \
$meter3 \
continue flowid :3
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
match ip tos 0x88 0xfc \
$meter3a \
drop flowid :3
#
# *********************** AF42 ***************************
#AF42 (DSCP 0x24) is passed on with a tcindex value 2
#if it doesn't exceed its CIR/CBS + PIR/EBS
#meters meter2 and meter2a are used. Note that these are shared with the AF41
#
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 8 u32 \
match ip tos 0x90 0xfc \
$meter2 \
continue flowid :2
$TC filter add dev $INDEV parent ffff: protocol ip prio 9 u32 \
match ip tos 0x90 0xfc \
$meter2a \
continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (meters meter3 and meter3a)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 10 u32 \
match ip tos 0x90 0xfc \
$meter3 \
continue flowid :3
$TC filter add dev $INDEV parent ffff: protocol ip prio 11 u32 \
match ip tos 0x90 0xfc \
$meter3a \
drop flowid :3
#
# *********************** AF43 ***************************
#
#AF43 (DSCP 0x26) is passed on with a tcindex value 3
#if it doesn't exceed its CIR/CBS + PIR/EBS
#meters meter3 and meter3a are used. Note that these are shared with
#the AF41 and AF42
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 13 u32 \
match ip tos 0x98 0xfc \
$meter3 \
continue flowid :3
$TC filter add dev $INDEV parent ffff: protocol ip prio 14 u32 \
match ip tos 0x98 0xfc \
$meter3a \
drop flowid :3
#
## *********************** BE ***************************
##
## Anything else (not from the AF4*) gets discarded if it
## exceeds 1Mbps and by default goes to BE if it doesn't
## Note that the BE class is also used by the AF4* in the worst
## case
##
$TC filter add dev $INDEV parent ffff: protocol ip prio 16 u32 \
match ip src 0/0\
$meter4 \
drop flowid :4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#note that the ECN bits are masked out
#
#AF41 (0x88 is 0x22 shifted to the left by two bits)
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
    value 0x88
#AF42
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
    value 0x90
#AF43
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
    value 0x98
#BE -- class 1:4, the class that tcindex 4 is mapped to; writing this
#value to 1:3 would overwrite the AF43 setting and leave 1:4 unset
$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
    value 0x0
#
#
# The class mapping: tcindex N selects class 1:N
#
for idx in 1 2 3 4; do
    $TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
        handle $idx tcindex classid 1:$idx
done
#
#
# Print qdisc, class and filter state for one side.
# args: $1 = label, $2 = device arguments for tc, $3 = filter parent
show_tc_state() {
    echo "---- qdisc parameters $1 ----------"
    $TC qdisc ls $2
    echo "---- Class parameters $1 ----------"
    $TC class ls $2
    echo "---- filter parameters $1 ----------"
    $TC filter ls $2 parent $3
}
show_tc_state Ingress "dev $INDEV" ffff:
show_tc_state Egress "$EGDEV" 1:0
#
#deleting the ingress qdisc
#$TC qdisc del dev $INDEV ingress

View File

@ -0,0 +1,144 @@
#! /bin/sh -x
#
# sample script on using the ingress capabilities
# This script fwmark tags(IPchains) based on metering on the ingress
# interface the result is used for fast classification and re-marking
# on the egress interface
# This is an example of a color blind mode marker with no PIR configured
# based on draft-wahjak-mcm-00.txt (section 3.1)
#
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
EGDEV="dev eth1"
CIR1=1500kbit
CIR2=500kbit
#The CBS is about 60 MTU sized packets
CBS1=90k
CBS2=90k
meter1="police rate $CIR1 burst $CBS1 "
meter1a="police rate $CIR2 burst $CBS1 "
meter2="police rate $CIR1 burst $CBS2 "
meter2a="police rate $CIR2 burst $CBS2 "
meter3="police rate $CIR2 burst $CBS1 "
meter3a="police rate $CIR2 burst $CBS1 "
meter4="police rate $CIR2 burst $CBS2 "
meter5="police rate $CIR1 burst $CBS2 "
#
# tag all incoming packets from subnet 10.2.0.0/24 with fw mark 1
# tag all incoming packets from any other subnet with fw mark 2
############################################################
$IPCHAINS -A input -i $INDEV -s 0/0 -m 2
$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1
#
############################################################
# install the ingress qdisc on the ingress interface
$TC qdisc add dev $INDEV handle ffff: ingress
#
############################################################
# All packets are marked with a tcindex value which is used on the egress
# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
#
############################################################
#
# anything with fw tag of 1 is passed on with a tcindex value 1
#if it doesnt exceed its allocated rate (CIR/CBS)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 1 handle 1 fw \
$meter1 \
continue flowid 4:1
$TC filter add dev $INDEV parent ffff: protocol ip prio 2 handle 1 fw \
$meter1a \
continue flowid 4:1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
#tcindex value of 2
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 3 handle 1 fw \
$meter2 \
continue flowid 4:2
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1 fw \
$meter2a \
continue flowid 4:2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \
$meter3 \
continue flowid 4:3
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \
$meter3a \
drop flowid 4:3
#
# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
# exceeds meter5 (rate CIR1, i.e. 1500kbit) and by default goes to BE if it
# doesn't
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 7 handle 2 fw \
$meter5 \
drop flowid 4:4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#note that the ECN bits are masked out
#
#AF41 (0x88 is the DSCP 0x22 shifted to the left by two bits)
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
value 0x88
#AF42
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
value 0x90
#AF43
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
value 0x98
#BE
$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
value 0x0
#
#
# The class mapping (using tcindex; could easily have
# replaced it with the fw classifier instead)
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 1 tcindex classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 2 tcindex classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 3 tcindex classid 1:3
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 4 tcindex classid 1:4
#
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#
#deleting the ingress qdisc
#$TC qdisc del $INDEV ingress

View File

@ -0,0 +1,145 @@
#! /bin/sh
#
# sample script on using the ingress capabilities using u32 classifier
# This script tags tcindex based on metering on the ingress
# interface the result is used for fast classification and re-marking
# on the egress interface
# This is an example of a color blind mode marker with PIR configured
# based on draft-wahjak-mcm-00.txt (section 3.2)
#
#path to various utilities;
#change to reflect yours.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
INDEV=eth2
EGDEV="dev eth1"
CIR1=1000kbit
CIR2=1000kbit
# The PIR is the excess (in addition to the CIR i.e if always
# going to the PIR --> average rate is CIR+PIR)
PIR1=1000kbit
PIR2=500kbit
#The CBS is about 60 MTU sized packets
CBS1=90k
CBS2=90k
#the EBS is about 10 max sized packets
EBS1=15k
EBS2=15k
# The meters
meter1=" police rate $CIR1 burst $CBS1 "
meter1a=" police rate $PIR1 burst $EBS1 "
meter2=" police rate $CIR2 burst $CBS1 "
meter2a="police rate $PIR2 burst $CBS1 "
meter3=" police rate $CIR2 burst $CBS2 "
meter3a=" police rate $PIR2 burst $EBS2 "
meter4=" police rate $CIR1 burst $CBS2 "
meter5=" police rate $CIR1 burst $CBS2 "
# install the ingress qdisc on the ingress interface
############################################################
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
############################################################
# All packets are marked with a tcindex value which is used on the egress
# NOTE: tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
#
#anything from subnet 10.2.0.0/24 is passed on with a tcindex value 1
#if it doesn't exceed its CIR/CBS + PIR/EBS
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \
match ip src 10.2.0.0/24 $meter1 \
continue flowid :1
$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \
match ip src 10.2.0.0/24 $meter1a \
continue flowid :1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
#tcindex value of 2
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \
match ip src 10.2.0.0/24 $meter2 \
continue flowid :2
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
match ip src 10.2.0.0/24 $meter2a \
continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
match ip src 10.2.0.0/24 $meter3 \
continue flowid :3
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
match ip src 10.2.0.0/24 $meter3a \
drop flowid :3
#
#
# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it
# exceeds 1Mbps and by default goes to BE if it doesn't
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \
match ip src 0/0 $meter5 \
drop flowid :4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
#note that the ECN bits are masked out
#
# Program the DS byte written for each dsmark class. "mask 0x3" keeps only
# the two low (ECN) bits of the original byte; "value" is OR-ed in on top.
#
#AF41 (0x88 is the DSCP 0x22 shifted to the left by two bits)
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
       value 0x88
#AF42
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
       value 0x90
#AF43
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
       value 0x98
#BE -- must be classid 1:4: the class mapping sends tcindex 4 to 1:4, and
#      re-programming 1:3 here would overwrite the AF43 value set just above
$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
       value 0x0
#
#
# The class mapping
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 1 tcindex classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 2 tcindex classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 3 tcindex classid 1:3
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
handle 4 tcindex classid 1:4
#
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#
#deleting the ingress qdisc
#$TC qdisc del $INDEV ingress

98
examples/diffserv/README Normal file
View File

@ -0,0 +1,98 @@
Note all these are mere examples which can be customized to your needs
AFCBQ
-----
AF PHB built using CBQ, DSMARK,GRED (default in GRIO mode) ,RED for BE
and the tcindex classifier with some algorithmic mapping
EFCBQ
-----
EF PHB built using CBQ (for rate control and prioritization),
DSMARK( to remark DSCPs), tcindex classifier and RED for the BE
traffic.
EFPRIO
------
EF PHB using the PRIO scheduler, Token Bucket to rate control EF,
tcindex classifier, DSMARK to remark, and RED for the BE traffic
EDGE scripts
==============
CB-3(1|2)-(u32/chains)
======================
The major differences are that the classifier is u32 on -u32 extension
and IPchains on the chains extension. CB stands for color Blind
and 31 is for the mode where only a CIR and CBS are defined whereas
32 stands for a mode where a CIR/CBS + PIR/EBS are defined.
Color Blind (CB)
==========-----=
We look at one special subnet that we are interested in for simplicity
reasons to demonstrate the capability. We send the packets from that
subnet to AF4*, BE or end up dropping depending on the metering results.
The algorithm overview is as follows:
*classify:
**case: subnet X
----------------
if !exceed meter1 tag as AF41
else
if !exceed meter2 tag as AF42
else
if !exceed meter 3 tag as AF43
else
drop
default case: Any other subnet
-------------------------------
if !exceed meter 5 tag as BE
else
drop
On the Egress side, change the DSCPs of the packets to reflect AF4* and BE
based on the tags from the ingress.
-------------------------------------------------------------
Color Aware
===========
Define some meters with policing and give them IDs, e.g.
meter1=police index 1 rate $CIR1 burst $CBS1
meter2=police index 2 rate $CIR2 burst $CBS2 etc
General overview:
classify based on the DSCPs and use the policer ids to decide tagging
*classify on ingress:
switch (dscp) {
case AF41: /* tos&0xfc == 0x88 */
if (!exceed meter1) break;
case AF42: /* tos&0xfc == 0x90 */
if (!exceed meter2) {
tag as AF42;
break;
}
case AF43: /* tos&0xfc == 0x98 */
if (!exceed meter3) {
tag as AF43;
break;
} else
drop;
default:
if (!exceed meter4) tag as BE;
else drop;
}
On the Egress side mark the proper AF tags

105
examples/diffserv/afcbq Normal file
View File

@ -0,0 +1,105 @@
#!/usr/bin/perl
#
#
# AF using CBQ for a single interface eth0
# 4 AF classes using GRED and one BE using RED
# Things you might want to change:
# - the device bandwidth (set at 10Mbits)
# - the bandwidth allocated for each AF class and the BE class
# - the drop probability associated with each AF virtual queue
#
# AF DSCP values used (based on AF draft 04)
# -----------------------------------------
# AF DSCP values
# AF1 1. 0x0a 2. 0x0c 3. 0x0e
# AF2 1. 0x12 2. 0x14 3. 0x16
# AF3 1. 0x1a 2. 0x1c 3. 0x1e
# AF4 1. 0x22 2. 0x24 3. 0x26
#
#
# A simple DSCP-class relationship formula used to generate
# values in the for loop of this script; $drop stands for the
# DP
# $dscp = ($class*8+$drop*2)
#
# if you use GRIO buffer sharing, then GRED priority is set as follows:
# $gprio=$drop+1;
#
$TC = "/usr/src/iproute2-current/tc/tc";
$DEV = "dev lo";
$DEV = "dev eth1";
$DEV = "dev eth0";
# the BE-class number
$beclass = "5";
#GRIO buffer sharing on or off?
$GRIO = "";
$GRIO = "grio";
# The bandwidth of your device
$linerate="10Mbit";
# The BE and AF rates
%rate_table=();
$berate="1500Kbit";
$rate_table{"AF1rate"}="1500Kbit";
$rate_table{"AF2rate"}="1500Kbit";
$rate_table{"AF3rate"}="1500Kbit";
$rate_table{"AF4rate"}="1500Kbit";
#
#
#
print "\n# --- General setup ---\n";
print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n";
print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex mask 0xfc " .
"shift 2 pass_on\n";
#"shift 2\n";
print "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth $linerate ".
"cell 8 avpkt 1000 mpu 64\n";
print "$TC filter add $DEV parent 2:0 protocol ip prio 1 tcindex ".
"mask 0xf0 shift 4 pass_on\n";
for $class (1..4) {
print "\n# --- AF Class $class specific setup---\n";
$AFrate=sprintf("AF%drate",$class);
print "$TC class add $DEV parent 2:0 classid 2:$class cbq ".
"bandwidth $linerate rate $rate_table{$AFrate} avpkt 1000 prio ".
(6-$class)." bounded allot 1514 weight 1 maxburst 21\n";
print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle $class ".
"tcindex classid 2:$class\n";
print "$TC qdisc add $DEV parent 2:$class gred setup DPs 3 default 2 ".
"$GRIO\n";
#
# per DP setup
#
for $drop (1..3) {
print "\n# --- AF Class $class DP $drop---\n";
$dscp = $class*8+$drop*2;
$tcindex = sprintf("1%x%x",$class,$drop);
print "$TC filter add $DEV parent 1:0 protocol ip prio 1 ".
"handle $dscp tcindex classid 1:$tcindex\n";
$prob = $drop*0.02;
if ($GRIO) {
$gprio = $drop+1;
print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ".
"max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ".
"probability $prob ".
"prio $gprio\n";
} else {
print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ".
"max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ".
"probability $prob \n";
}
}
}
#
#
# Best-effort queue: everything the AF tcindex filters did not claim.
# The class, its filter and its RED qdisc all use $beclass, so changing the
# BE class number at the top of the script keeps the three in step (with the
# default "5" the generated commands are unchanged).
print "\n#------BE Queue setup------\n";
# catch-all on the dsmark level: mask 0 makes every packet match handle 0
print "$TC filter add $DEV parent 1:0 protocol ip prio 2 ".
          "handle 0 tcindex mask 0 classid 1:1\n";
print "$TC class add $DEV parent 2:0 classid 2:$beclass cbq ".
      "bandwidth $linerate rate $berate avpkt 1000 prio 6 " .
      "bounded allot 1514 weight 1 maxburst 21 \n";
print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle 0 tcindex ".
      "classid 2:$beclass\n";
print "$TC qdisc add $DEV parent 2:$beclass red limit 60KB min 15KB max 45KB ".
      "burst 20 avpkt 1000 bandwidth $linerate probability 0.4\n";

25
examples/diffserv/ef-prio Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/perl
#
# EF PHB on top of the PRIO scheduler.
# This script only prints the tc command lines to stdout, one per line;
# it does not run them.
#
$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
$DEV = "dev eth1";
$efrate="1.5Mbit";
$MTU="1.5kB";

my @commands = (
    # root dsmark, tcindex taken from the DS field, prio qdisc underneath
    "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index",
    "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex mask 0xfc shift 2",
    "$TC qdisc add $DEV parent 1:0 handle 2:0 prio",

    # EF class: Maximum about one MTU sized packet allowed on the queue
    "$TC qdisc add $DEV parent 2:1 tbf rate $efrate burst $MTU limit 1.6kB",
    "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle 0x2e tcindex classid 2:1 pass_on",

    # BE class: RED queue plus a match-everything filter
    "#BE class(2:2) ",
    "$TC qdisc add $DEV parent 2:2 red limit 60KB min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit probability 0.4",
    "$TC filter add $DEV parent 2:0 protocol ip prio 2 handle 0 tcindex mask 0 classid 2:2 pass_on",
);

foreach my $line (@commands) {
    print $line, "\n";
}

31
examples/diffserv/efcbq Normal file
View File

@ -0,0 +1,31 @@
#!/usr/bin/perl
#
# EF PHB on top of CBQ.
# This script only prints the tc command lines to stdout, one per line;
# it does not run them.
#
$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
$DEV = "dev eth1";

my @commands = (
    # root dsmark, tcindex taken from the DS field, cbq qdisc underneath
    "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index",
    "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex mask 0xfc shift 2",
    "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth 10Mbit cell 8 avpkt 1000 mpu 64",

    # EF class (the trailing blank in the class line is part of the output)
    "$TC class add $DEV parent 2:0 classid 2:1 cbq bandwidth 10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated allot 1514 weight 1 maxburst 10 ",
    # packet fifo for EF?
    "$TC qdisc add $DEV parent 2:1 pfifo limit 5",
    "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle 0x2e tcindex classid 2:1 pass_on",

    # BE class
    "#BE class(2:2) ",
    "$TC class add $DEV parent 2:0 classid 2:2 cbq bandwidth 10Mbit rate 5Mbit avpkt 1000 prio 7 allot 1514 weight 1 maxburst 21 borrow split 2:0 defmap 0xffff ",
    "$TC qdisc add $DEV parent 2:2 red limit 60KB min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit probability 0.4",
    "$TC filter add $DEV parent 2:0 protocol ip prio 2 handle 0 tcindex mask 0 classid 2:2 pass_on",
);

foreach my $line (@commands) {
    print $line, "\n";
}

View File

@ -0,0 +1,125 @@
These were the tests done to validate the Diffserv scripts.
This document will be updated continuously. If you do more
thorough validation testing please post the details to the
diffserv mailing list.
Nevertheless, these tests should serve for basic validation.
AFCBQ, EFCBQ, EFPRIO
----------------------
generate all possible DSCPs and observe that they
get sent to the proper classes. In the case of AF also
to the correct Virtual Queues.
Edge1
-----
generate TOS values 0x0,0x10,0xbb each with IP addresses
10.2.0.24 (mark 1), 10.2.0.3 (mark2) and 10.2.0.30 (mark 3)
and observe that they get marked as expected.
Edge2
-----
-Repeat the tests in Edge1
-ftp with data direction from 10.2.0.2
*observe that the metering/policing works correctly (and the marking
as well). In this case the mark used will be 3
Edge31-cb-chains
----------------
-ftp with data direction from 10.2.0.2
*observe that the metering/policing works correctly (and the marking
as well). In this case the mark used will be 1.
Metering: The data throughput should not exceed 2*CIR1 + 2*CIR2
which is roughly: 5mbps
Marking: there should be a variation of marked packets:
AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0)
More tests required to see the interaction of several sources (other
than subnet 10.2.0.0/24).
Edge31-ca-u32
--------------
Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the
discard port of 10.1.0.2 (behind eth1)
1) generate with src tos = 0x88
Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2
approximately 5mbps
Marking: Should vary between 0x88,0x90,0x98 and 0x0
2) generate with src tos = 0x90
Metering: Allocated throughput should not exceed CIR1 + 2*CIR2
approximately 3.5mbps
Marking: Should vary between 0x90,0x98 and 0x0
3) generate with src tos = 0x98
Metering: Allocated throughput should not exceed CIR1 + CIR2
approximately 2.5mbps
Marking: Should vary between 0x98 and 0x0
4) generate with src tos any other than the above
Metering: Allocated throughput should not exceed CIR1
approximately 1.5mbps
Marking: Should be consistent at 0x0
TODO: Testing on how each color shares when all 4 types of packets
are going through the edge device
Edge32-cb-u32, Edge32-cb-chains
-------------------------------
-ftp with data direction from 10.2.0.2
*observe that the metering/policing works correctly (and the marking
as well).
Metering:
The data throughput should not exceed 2*CIR1 + 2*CIR2
+ 2*PIR2 + PIR1 for u32 which is roughly: 6mbps
The data throughput should not exceed 2*CIR1 + 5*CIR2
for chains which is roughly: 6mbps
Marking: there should be a variation of marked packets:
AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0)
TODO:
-More tests required to see the interaction of several sources (other
than subnet 10.2.0.0/24).
-More tests needed to capture stats on how many times the CIR was exceeded
but the data was not remarked etc.
Edge32-ca-u32
--------------
Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the
discard port of 10.1.0.2 (behind eth1)
1) generate with src tos = 0x88
Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2
+PIR1 -- approximately 4mbps
Marking: Should vary between 0x88,0x90,0x98 and 0x0
2) generate with src tos = 0x90
Metering: Allocated throughput should not exceed CIR1 + 2*CIR2
+ 2* PIR2 approximately 3mbps
Marking: Should vary between 0x90,0x98 and 0x0
3) generate with src tos = 0x98
Metering: Allocated throughput should not exceed PIR1+ CIR1 + CIR2
approximately 2.5mbps
Marking: Should vary between 0x98 and 0x0
4) generate with src tos any other than the above
Metering: Allocated throughput should not exceed CIR1
approximately 1mbps
Marking: Should be consistent at 0x0
TODO: Testing on how each color shares when all 4 types of packets
are going through the edge device

134
examples/gaiconf Normal file
View File

@ -0,0 +1,134 @@
#!/bin/sh
#
# Setup address label from /etc/gai.conf
#
# Written by YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>, 2010.
#

IP=ip
DEFAULT_GAICONF=/etc/gai.conf
verbose=
debug=
# Start from a known state instead of inheriting these from the environment.
cmd=
gai_conf=
TEMPFILE=

# run CMD [ARG...]
# Echo the command line when $verbose is set; execute it unless $debug is set.
run ()
{
	if [ x"$verbose" != x"" ]; then
		echo "$@"
	fi
	if [ x"$debug" = x"" ]; then
		"$@"
	fi
}

# do_load_config FILE
# Add one address label per label entry found in FILE.  The current label
# table is flushed once, right before the first entry is added, so a file
# without any entry leaves the table untouched.
do_load_config ()
{
	file=$1; shift
	flush=1
	cat "$file" | while read command prefix label; do
		case "$command" in
		label|"#label")
			# "label" is what the built-in default table below
			# uses.  "#label" is the only keyword the script used
			# to accept and is kept for compatibility.
			# NOTE(review): confirm that loading commented-out
			# "#label" lines is really wanted.
			if [ ${flush} = 1 ]; then
				run ${IP} -6 addrlabel flush
				flush=0
			fi
			# $prefix/$label stay unquoted on purpose: "read" puts
			# the rest of the line into $label.
			run ${IP} -6 addrlabel add prefix $prefix label $label
			;;
		esac
	done
}

# do_list_config
# Print the current label table as "label PREFIX LABEL" lines, built from
# the 2nd and 4th field of each "ip -6 addrlabel list" output line.
do_list_config ()
{
	${IP} -6 addrlabel list | while read p pfx l lbl; do
		echo label ${pfx} ${lbl}
	done
}

# help
# Print the usage line and terminate with status 1.
help ()
{
	echo "Usage: $0 [-v] {--list | --config [ ${DEFAULT_GAICONF} ] | --default}"
	exit 1
}

TEMP=`getopt -o c::dlv -l config::,default,list,verbose -n gaiconf -- "$@"`
if [ $? != 0 ]; then
	echo "Terminating..." >&2
	exit 1
fi

TEMPFILE=`mktemp` || exit 1
# Remove the scratch file on every exit path, including the "exit 1" in help.
trap 'rm -f "${TEMPFILE}"' EXIT

eval set -- "$TEMP"

while true ; do
	case "$1" in
	-c|--config)
		if [ x"$cmd" != x"" ]; then
			help
		fi
		# getopt passes an empty string when the optional argument
		# was omitted; fall back to the system file in that case.
		gai_conf=${2:-${DEFAULT_GAICONF}}
		shift 2
		cmd=config
		;;
	-d|--default)
		if [ x"$cmd" != x"" ]; then
			help
		fi
		gai_conf=${TEMPFILE}
		cmd=config
		# Without this shift the loop saw the same option again and
		# bailed out through help.
		shift
		;;
	-l|--list)
		if [ x"$cmd" != x"" ]; then
			help
		fi
		cmd=list
		shift
		;;
	-v|--verbose)
		verbose=1
		shift
		;;
	--)
		shift;
		break
		;;
	*)
		echo "Internal error!" >&2
		exit 1
		;;
	esac
done

case "$cmd" in
config)
	if [ x"$gai_conf" = x"${TEMPFILE}" ]; then
		# Built-in default table; sed strips the indentation.
		sed -e 's/^[[:space:]]*//' <<END_OF_DEFAULT >"${TEMPFILE}"
		label ::1/128       0
		label ::/0          1
		label 2002::/16     2
		label ::/96         3
		label ::ffff:0:0/96 4
		label fec0::/10     5
		label fc00::/7      6
		label 2001:0::/32   7
END_OF_DEFAULT
	fi
	do_load_config "$gai_conf"
	;;
list)
	do_list_config
	;;
*)
	help
	;;
esac

exit 0

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
GENLOBJ=genl.o
include ../config.mk
include ../Config
SHARED_LIBS ?= y
CFLAGS += -fno-strict-aliasing
@ -21,7 +20,6 @@ endif
all: genl
genl: $(GENLOBJ) $(LIBNETLINK) $(LIBUTIL) $(GENLLIB)
$(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@
install: all
install -m 0755 genl $(DESTDIR)$(SBINDIR)

View File

@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <syslog.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netinet/in.h>
@ -28,18 +29,89 @@
static int usage(void)
{
fprintf(stderr,"Usage: ctrl <CMD>\n" \
"CMD := get <PARMS> | list | monitor | policy <PARMS>\n" \
"CMD := get <PARMS> | list | monitor\n" \
"PARMS := name <name> | id <id>\n" \
"Examples:\n" \
"\tctrl ls\n" \
"\tctrl monitor\n" \
"\tctrl get name foobar\n" \
"\tctrl get id 0xF\n"
"\tctrl policy name foobar\n"
"\tctrl policy id 0xF\n");
"\tctrl get id 0xF\n");
return -1;
}
/*
 * Ask the generic netlink controller for the numeric id of the family
 * named @family.
 *
 * Sends a CTRL_CMD_GETFAMILY request carrying CTRL_ATTR_FAMILY_NAME and
 * reads CTRL_ATTR_FAMILY_ID from the CTRL_CMD_NEWFAMILY answer.
 *
 * Returns the family id on success.  Returns 0 when the request could not
 * be built, the exchange failed, or the answer is not a usable NEWFAMILY
 * message, and -1 when the answer is shorter than a genetlink header.
 * Terminates the process with exit(1) when the netlink socket cannot be
 * opened.  A diagnostic is printed to stderr for every failure except the
 * request-build case (NOTE(review): addattr_l is defined outside this
 * view; whether it prints its own message is not visible here).
 */
int genl_ctrl_resolve_family(const char *family)
{
	struct rtnl_handle rth;
	struct rtattr *tb[CTRL_ATTR_MAX + 1];
	struct rtattr *attrs;
	struct nlmsghdr *nlh;
	struct genlmsghdr *ghdr;
	int len;
	int ret = 0;
	struct {
		struct nlmsghdr n;
		char buf[4096];
	} req;

	memset(&req, 0, sizeof(req));

	nlh = &req.n;
	nlh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	nlh->nlmsg_type = GENL_ID_CTRL;

	ghdr = NLMSG_DATA(&req.n);
	ghdr->cmd = CTRL_CMD_GETFAMILY;

	if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0) {
		fprintf(stderr, "Cannot open generic netlink socket\n");
		exit(1);
	}

	/* Do not send a request without the name if it did not fit. */
	if (addattr_l(nlh, 128, CTRL_ATTR_FAMILY_NAME,
		      family, strlen(family) + 1) < 0)
		goto errout;

	/*
	 * nlh is handed in as both the request and the answer buffer; the
	 * checks below read the answer back out of it.
	 */
	if (rtnl_talk(&rth, nlh, nlh, sizeof(req)) < 0) {
		fprintf(stderr, "Error talking to the kernel\n");
		goto errout;
	}

	ghdr = NLMSG_DATA(nlh);
	len = nlh->nlmsg_len;

	if (nlh->nlmsg_type != GENL_ID_CTRL) {
		fprintf(stderr, "Not a controller message, nlmsg_len=%d "
			"nlmsg_type=0x%x\n", nlh->nlmsg_len, nlh->nlmsg_type);
		goto errout;
	}

	if (ghdr->cmd != CTRL_CMD_NEWFAMILY) {
		fprintf(stderr, "Unknown controller command %d\n", ghdr->cmd);
		goto errout;
	}

	len -= NLMSG_LENGTH(GENL_HDRLEN);
	if (len < 0) {
		fprintf(stderr, "wrong controller message len %d\n", len);
		/* Same -1 result as before, but no longer leaks the socket. */
		ret = -1;
		goto errout;
	}

	attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
	parse_rtattr(tb, CTRL_ATTR_MAX, attrs, len);

	if (tb[CTRL_ATTR_FAMILY_ID] == NULL) {
		fprintf(stderr, "Missing family id TLV\n");
		goto errout;
	}

	ret = rta_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]);

errout:
	rtnl_close(&rth);
	return ret;
}
static void print_ctrl_cmd_flags(FILE *fp, __u32 fl)
{
fprintf(fp, "\n\t\tCapabilities (0x%x):\n ", fl);
@ -60,7 +132,7 @@ static void print_ctrl_cmd_flags(FILE *fp, __u32 fl)
fprintf(fp, "\n");
}
static int print_ctrl_cmds(FILE *fp, struct rtattr *arg, __u32 ctrl_ver)
{
struct rtattr *tb[CTRL_ATTR_OP_MAX + 1];
@ -105,8 +177,8 @@ static int print_ctrl_grp(FILE *fp, struct rtattr *arg, __u32 ctrl_ver)
/*
* The controller sends one nlmsg per family
*/
static int print_ctrl(struct rtnl_ctrl_data *ctrl,
struct nlmsghdr *n, void *arg)
static int print_ctrl(const struct sockaddr_nl *who, struct nlmsghdr *n,
void *arg)
{
struct rtattr *tb[CTRL_ATTR_MAX + 1];
struct genlmsghdr *ghdr = NLMSG_DATA(n);
@ -125,8 +197,7 @@ static int print_ctrl(struct rtnl_ctrl_data *ctrl,
ghdr->cmd != CTRL_CMD_DELFAMILY &&
ghdr->cmd != CTRL_CMD_NEWFAMILY &&
ghdr->cmd != CTRL_CMD_NEWMCAST_GRP &&
ghdr->cmd != CTRL_CMD_DELMCAST_GRP &&
ghdr->cmd != CTRL_CMD_GETPOLICY) {
ghdr->cmd != CTRL_CMD_DELMCAST_GRP) {
fprintf(stderr, "Unknown controller command %d\n", ghdr->cmd);
return 0;
}
@ -139,7 +210,7 @@ static int print_ctrl(struct rtnl_ctrl_data *ctrl,
}
attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
parse_rtattr_flags(tb, CTRL_ATTR_MAX, attrs, len, NLA_F_NESTED);
parse_rtattr(tb, CTRL_ATTR_MAX, attrs, len);
if (tb[CTRL_ATTR_FAMILY_NAME]) {
char *name = RTA_DATA(tb[CTRL_ATTR_FAMILY_NAME]);
@ -162,36 +233,6 @@ static int print_ctrl(struct rtnl_ctrl_data *ctrl,
__u32 *ma = RTA_DATA(tb[CTRL_ATTR_MAXATTR]);
fprintf(fp, " max attribs: %d ",*ma);
}
if (tb[CTRL_ATTR_OP_POLICY]) {
const struct rtattr *pos;
rtattr_for_each_nested(pos, tb[CTRL_ATTR_OP_POLICY]) {
struct rtattr *ptb[CTRL_ATTR_POLICY_DUMP_MAX + 1];
struct rtattr *pattrs = RTA_DATA(pos);
int plen = RTA_PAYLOAD(pos);
parse_rtattr_flags(ptb, CTRL_ATTR_POLICY_DUMP_MAX,
pattrs, plen, NLA_F_NESTED);
fprintf(fp, " op %d policies:",
pos->rta_type & ~NLA_F_NESTED);
if (ptb[CTRL_ATTR_POLICY_DO]) {
__u32 *v = RTA_DATA(ptb[CTRL_ATTR_POLICY_DO]);
fprintf(fp, " do=%d", *v);
}
if (ptb[CTRL_ATTR_POLICY_DUMP]) {
__u32 *v = RTA_DATA(ptb[CTRL_ATTR_POLICY_DUMP]);
fprintf(fp, " dump=%d", *v);
}
}
}
if (tb[CTRL_ATTR_POLICY])
nl_print_policy(tb[CTRL_ATTR_POLICY], fp);
/* end of family definitions .. */
fprintf(fp,"\n");
if (tb[CTRL_ATTR_OPS]) {
@ -240,37 +281,34 @@ static int print_ctrl(struct rtnl_ctrl_data *ctrl,
return 0;
}
static int print_ctrl2(struct nlmsghdr *n, void *arg)
{
return print_ctrl(NULL, n, arg);
}
static int ctrl_list(int cmd, int argc, char **argv)
{
struct rtnl_handle rth;
struct nlmsghdr *nlh;
struct genlmsghdr *ghdr;
int ret = -1;
char d[GENL_NAMSIZ];
struct {
struct nlmsghdr n;
struct genlmsghdr g;
char buf[4096];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN),
.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
.n.nlmsg_type = GENL_ID_CTRL,
.g.cmd = CTRL_CMD_GETFAMILY,
};
struct nlmsghdr *nlh = &req.n;
struct nlmsghdr *answer = NULL;
} req;
memset(&req, 0, sizeof(req));
nlh = &req.n;
nlh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
nlh->nlmsg_type = GENL_ID_CTRL;
ghdr = NLMSG_DATA(&req.n);
ghdr->cmd = CTRL_CMD_GETFAMILY;
if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0) {
fprintf(stderr, "Cannot open generic netlink socket\n");
exit(1);
}
if (cmd == CTRL_CMD_GETFAMILY || cmd == CTRL_CMD_GETPOLICY) {
req.g.cmd = cmd;
if (cmd == CTRL_CMD_GETFAMILY) {
if (argc != 2) {
fprintf(stderr, "Wrong number of params\n");
return -1;
@ -278,7 +316,7 @@ static int ctrl_list(int cmd, int argc, char **argv)
if (matches(*argv, "name") == 0) {
NEXT_ARG();
strlcpy(d, *argv, sizeof(d));
strncpy(d, *argv, sizeof (d) - 1);
addattr_l(nlh, 128, CTRL_ATTR_FAMILY_NAME,
d, strlen(d) + 1);
} else if (matches(*argv, "id") == 0) {
@ -295,22 +333,20 @@ static int ctrl_list(int cmd, int argc, char **argv)
fprintf(stderr, "Wrong params\n");
goto ctrl_done;
}
}
if (cmd == CTRL_CMD_GETFAMILY) {
if (rtnl_talk(&rth, nlh, &answer) < 0) {
if (rtnl_talk(&rth, nlh, nlh, sizeof(req)) < 0) {
fprintf(stderr, "Error talking to the kernel\n");
goto ctrl_done;
}
if (print_ctrl2(answer, (void *) stdout) < 0) {
if (print_ctrl(NULL, nlh, (void *) stdout) < 0) {
fprintf(stderr, "Dump terminated\n");
goto ctrl_done;
}
}
if (cmd == CTRL_CMD_UNSPEC || cmd == CTRL_CMD_GETPOLICY) {
if (cmd == CTRL_CMD_UNSPEC) {
nlh->nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
nlh->nlmsg_seq = rth.dump = ++rth.seq;
@ -319,13 +355,12 @@ static int ctrl_list(int cmd, int argc, char **argv)
goto ctrl_done;
}
rtnl_dump_filter(&rth, print_ctrl2, stdout);
rtnl_dump_filter(&rth, print_ctrl, stdout);
}
ret = 0;
ctrl_done:
free(answer);
rtnl_close(&rth);
return ret;
}
@ -361,8 +396,6 @@ static int parse_ctrl(struct genl_util *a, int argc, char **argv)
matches(*argv, "show") == 0 ||
matches(*argv, "lst") == 0)
return ctrl_list(CTRL_CMD_UNSPEC, argc-1, argv+1);
if (matches(*argv, "policy") == 0)
return ctrl_list(CTRL_CMD_GETPOLICY, argc-1, argv+1);
if (matches(*argv, "help") == 0)
return usage();
@ -375,5 +408,5 @@ static int parse_ctrl(struct genl_util *a, int argc, char **argv)
struct genl_util ctrl_genl_util = {
.name = "ctrl",
.parse_genlopt = parse_ctrl,
.print_genlopt = print_ctrl2,
.print_genlopt = print_ctrl,
};

View File

@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <syslog.h>
#include <fcntl.h>
#include <dlfcn.h>
#include <sys/socket.h>
@ -22,19 +23,21 @@
#include <errno.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h> /* until we put our own header */
#include "version.h"
#include "SNAPSHOT.h"
#include "utils.h"
#include "genl_utils.h"
int show_stats;
int show_details;
int show_raw;
int show_stats = 0;
int show_details = 0;
int show_raw = 0;
int resolve_hosts = 0;
static void *BODY;
static struct genl_util *genl_list;
static struct genl_util * genl_list;
static int print_nofopt(struct nlmsghdr *n, void *arg)
static int print_nofopt(const struct sockaddr_nl *who, struct nlmsghdr *n,
void *arg)
{
fprintf((FILE *) arg, "unknown genl type ..\n");
return 0;
@ -43,16 +46,15 @@ static int print_nofopt(struct nlmsghdr *n, void *arg)
static int parse_nofopt(struct genl_util *f, int argc, char **argv)
{
if (argc) {
fprintf(stderr,
"Unknown genl \"%s\", hence option \"%s\" is unparsable\n",
f->name, *argv);
fprintf(stderr, "Unknown genl \"%s\", hence option \"%s\" "
"is unparsable\n", f->name, *argv);
return -1;
}
return 0;
}
static struct genl_util *get_genl_kind(const char *str)
static struct genl_util *get_genl_kind(char *str)
{
void *dlh;
char buf[256];
@ -84,8 +86,9 @@ reg:
return f;
noexist:
f = calloc(1, sizeof(*f));
f = malloc(sizeof(*f));
if (f) {
memset(f, 0, sizeof(*f));
strncpy(f->name, str, 15);
f->parse_genlopt = parse_nofopt;
f->print_genlopt = print_nofopt;
@ -98,10 +101,9 @@ static void usage(void) __attribute__((noreturn));
static void usage(void)
{
fprintf(stderr,
"Usage: genl [ OPTIONS ] OBJECT [help] }\n"
"where OBJECT := { ctrl etc }\n"
" OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -V[ersion] | -h[elp] }\n");
fprintf(stderr, "Usage: genl [ OPTIONS ] OBJECT | help }\n"
"where OBJECT := { ctrl etc }\n"
" OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] }\n");
exit(-1);
}
@ -118,26 +120,24 @@ int main(int argc, char **argv)
} else if (matches(argv[1], "-raw") == 0) {
++show_raw;
} else if (matches(argv[1], "-Version") == 0) {
printf("genl utility, iproute2-%s\n", version);
printf("genl utility, iproute2-ss%s\n", SNAPSHOT);
exit(0);
} else if (matches(argv[1], "-help") == 0) {
usage();
} else {
fprintf(stderr,
"Option \"%s\" is unknown, try \"genl -help\".\n",
argv[1]);
fprintf(stderr, "Option \"%s\" is unknown, try "
"\"genl -help\".\n", argv[1]);
exit(-1);
}
argc--; argv++;
}
if (argc > 1) {
struct genl_util *a;
int ret;
struct genl_util *a = NULL;
a = get_genl_kind(argv[1]);
if (!a) {
fprintf(stderr, "bad genl %s\n", argv[1]);
fprintf(stderr,"bad genl %s\n", argv[1]);
exit(-1);
}

View File

@ -1,15 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TC_UTIL_H_
#define _TC_UTIL_H_ 1
#include <linux/genetlink.h>
#include "utils.h"
#include "linux/genetlink.h"
struct genl_util {
struct genl_util
{
struct genl_util *next;
char name[16];
int (*parse_genlopt)(struct genl_util *fu, int argc, char **argv);
int (*print_genlopt)(struct nlmsghdr *n, void *arg);
int (*print_genlopt)(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg);
};
extern int genl_ctrl_resolve_family(const char *family);
#endif

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file creates a dummy version of dynamic loading
* for environments where dynamic linking

1
include/SNAPSHOT.h Normal file
View File

@ -0,0 +1 @@
static const char SNAPSHOT[] = "150706";

View File

@ -1,275 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_API__
#define __BPF_API__
/* Note:
*
* This file can be included into eBPF kernel programs. It contains
* a couple of useful helper functions, map/section ABI (bpf_elf.h),
* misc macros and some eBPF specific LLVM built-ins.
*/
#include <stdint.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <asm/byteorder.h>
#include "bpf_elf.h"
/**
 * libbpf pin type.
 *
 * Enumerator values are spelled out explicitly; they are the same values
 * the implicit numbering produces (0 and 1).
 */
enum libbpf_pin_type {
	LIBBPF_PIN_NONE    = 0,
	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
	LIBBPF_PIN_BY_NAME = 1,
};
/**
 * Type helper macros.
 *
 * Each one expands to a declarator for something called `name`:
 *   __uint(name, val)  - pointer to an array of `val` ints, i.e. the integer
 *                        `val` is carried in the declared type, not in a value;
 *   __type(name, val)  - pointer to typeof(val);
 *   __array(name, val) - array (no length given) of pointers to typeof(val).
 *
 * NOTE(review): presumably intended for member declarations in BTF-style
 * map definitions - confirm against the users of these macros.
 */
#define __uint(name, val) int (*name)[val]
#define __type(name, val) typeof(val) *name
#define __array(name, val) typeof(val) *name[]
/**
 * Misc macros.
 *
 * Every definition is wrapped in #ifndef so that an earlier definition of
 * the same name (from an included header or the program) takes precedence.
 */
/* Turn X into a string literal. Single-level '#': a macro name passed
 * directly is stringified as written, it is not expanded first.
 */
#ifndef __stringify
# define __stringify(X) #X
#endif
/* Marks a declaration as possibly unused (compiler attribute). */
#ifndef __maybe_unused
# define __maybe_unused __attribute__((__unused__))
#endif
/* Fallback offsetof() implemented with the compiler built-in. */
#ifndef offsetof
# define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
#endif
/* Branch hints: !!(X) normalises X to 0/1 before it is handed to
 * __builtin_expect() together with the expected value.
 */
#ifndef likely
# define likely(X) __builtin_expect(!!(X), 1)
#endif
#ifndef unlikely
# define unlikely(X) __builtin_expect(!!(X), 0)
#endif
/* Byte-order conversions mapped onto the __constant_*() variants.
 * NOTE(review): those are presumably provided by <asm/byteorder.h>,
 * which this header includes - confirm.
 */
#ifndef htons
# define htons(X) __constant_htons((X))
#endif
#ifndef ntohs
# define ntohs(X) __constant_ntohs((X))
#endif
#ifndef htonl
# define htonl(X) __constant_htonl((X))
#endif
#ifndef ntohl
# define ntohl(X) __constant_ntohl((X))
#endif
/* Makes __inline__ expand to the always_inline attribute. */
#ifndef __inline__
# define __inline__ __attribute__((always_inline))
#endif
/**
 * Section helper macros.
 *
 * __section(NAME) places the annotated object in ELF section NAME and marks
 * it `used` so it is kept even when nothing references it. All other macros
 * here are __section() with a fixed or derived section name; the ELF_SECTION_*
 * names are not defined in this header (bpf_elf.h is included above).
 */
#ifndef __section
# define __section(NAME) \
__attribute__((section(NAME), used))
#endif
/* Section named "<ID>/<KEY>": both arguments are stringified and joined
 * with a "/" by string-literal concatenation.
 */
#ifndef __section_tail
# define __section_tail(ID, KEY) \
__section(__stringify(ID) "/" __stringify(KEY))
#endif
/* Note: the XDP and LWT entry macros both resolve to ELF_SECTION_PROG. */
#ifndef __section_xdp_entry
# define __section_xdp_entry \
__section(ELF_SECTION_PROG)
#endif
#ifndef __section_cls_entry
# define __section_cls_entry \
__section(ELF_SECTION_CLASSIFIER)
#endif
#ifndef __section_act_entry
# define __section_act_entry \
__section(ELF_SECTION_ACTION)
#endif
#ifndef __section_lwt_entry
# define __section_lwt_entry \
__section(ELF_SECTION_PROG)
#endif
#ifndef __section_license
# define __section_license \
__section(ELF_SECTION_LICENSE)
#endif
#ifndef __section_maps
# define __section_maps \
__section(ELF_SECTION_MAPS)
#endif
/**
 * Declaration helper macros.
 *
 * BPF_LICENSE(NAME) defines a char array called ____license, initialised
 * from NAME and placed in the license section via __section_license.
 * The expansion has no trailing semicolon; the caller supplies it.
 */
#ifndef BPF_LICENSE
# define BPF_LICENSE(NAME) \
char ____license[] __section_license = NAME
#endif
/**
 * Classifier helper.
 *
 * Expands to the bare tokens `-1` (not parenthesised).
 * NOTE(review): presumably a classifier return value meaning "use the
 * default" - this header does not show how it is consumed; confirm.
 */
#ifndef BPF_H_DEFAULT
# define BPF_H_DEFAULT -1
#endif
/**
 * BPF helper functions for tc. Individual flags are in linux/bpf.h
 *
 * __BPF_FUNC(NAME, args...) expands to a function-pointer declarator
 *     (* NAME)(args...) __maybe_unused
 * so the caller writes the storage class and return type in front of it.
 *
 * BPF_FUNC(NAME, args...) is __BPF_FUNC plus an initialiser: the pointer is
 * set to the constant BPF_FUNC_<NAME> cast to void *. For example
 *     static int BPF_FUNC(redirect, int ifindex, uint32_t flags);
 * becomes
 *     static int (* redirect)(int ifindex, uint32_t flags) __maybe_unused =
 *             (void *) BPF_FUNC_redirect;
 */
#ifndef __BPF_FUNC
# define __BPF_FUNC(NAME, ...) \
(* NAME)(__VA_ARGS__) __maybe_unused
#endif
#ifndef BPF_FUNC
# define BPF_FUNC(NAME, ...) \
__BPF_FUNC(NAME, __VA_ARGS__) = (void *) BPF_FUNC_##NAME
#endif
/* Map access/manipulation
 *
 * Each line defines a static function pointer through BPF_FUNC(), i.e. it is
 * initialised to (void *) BPF_FUNC_<name>. The lookup returns a void *,
 * update and delete return int.
 */
static void *BPF_FUNC(map_lookup_elem, void *map, const void *key);
static int BPF_FUNC(map_update_elem, void *map, const void *key,
const void *value, uint32_t flags);
static int BPF_FUNC(map_delete_elem, void *map, const void *key);
/* Time access (declared without parameters, returns uint64_t) */
static uint64_t BPF_FUNC(ktime_get_ns);
/* Debugging */
/* FIXME: __attribute__ ((format(printf, 1, 3))) not possible unless
 * llvm bug https://llvm.org/bugs/show_bug.cgi?id=26243 gets resolved.
 * It would require ____fmt to be made const, which generates a reloc
 * entry (non-map).
 */
static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...);
/* printt(fmt, ...): convenience wrapper around trace_printk().
 *
 * Uses a statement expression that copies the format string into a local,
 * non-const char array (see the FIXME above for why it is not const) and
 * passes that array together with its sizeof - which includes the
 * terminating NUL - followed by the optional arguments. Because fmt is used
 * as an array initialiser it has to be a string literal.
 */
#ifndef printt
# define printt(fmt, ...) \
({ \
char ____fmt[] = fmt; \
trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__); \
})
#endif
/* All declarations in this run go through BPF_FUNC(): each defines a static
 * function pointer named after the helper and initialised to
 * (void *) BPF_FUNC_<name>. Only the C signatures are fixed here; the
 * behaviour of each helper is defined by the kernel, not by this header.
 */
/* Random numbers */
static uint32_t BPF_FUNC(get_prandom_u32);
/* Tail calls */
static void BPF_FUNC(tail_call, struct __sk_buff *skb, void *map,
uint32_t index);
/* System helpers */
static uint32_t BPF_FUNC(get_smp_processor_id);
static uint32_t BPF_FUNC(get_numa_node_id);
/* Packet misc meta data */
static uint32_t BPF_FUNC(get_cgroup_classid, struct __sk_buff *skb);
/* NOTE(review): unlike its neighbours this one is declared without an skb
 * parameter - confirm against the kernel helper's signature.
 */
static int BPF_FUNC(skb_under_cgroup, void *map, uint32_t index);
static uint32_t BPF_FUNC(get_route_realm, struct __sk_buff *skb);
static uint32_t BPF_FUNC(get_hash_recalc, struct __sk_buff *skb);
static uint32_t BPF_FUNC(set_hash_invalid, struct __sk_buff *skb);
/* Packet redirection */
static int BPF_FUNC(redirect, int ifindex, uint32_t flags);
static int BPF_FUNC(clone_redirect, struct __sk_buff *skb, int ifindex,
uint32_t flags);
/* Packet manipulation */
static int BPF_FUNC(skb_load_bytes, struct __sk_buff *skb, uint32_t off,
void *to, uint32_t len);
static int BPF_FUNC(skb_store_bytes, struct __sk_buff *skb, uint32_t off,
const void *from, uint32_t len, uint32_t flags);
static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, uint32_t off,
uint32_t from, uint32_t to, uint32_t flags);
static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off,
uint32_t from, uint32_t to, uint32_t flags);
static int BPF_FUNC(csum_diff, const void *from, uint32_t from_size,
const void *to, uint32_t to_size, uint32_t seed);
static int BPF_FUNC(csum_update, struct __sk_buff *skb, uint32_t wsum);
static int BPF_FUNC(skb_change_type, struct __sk_buff *skb, uint32_t type);
static int BPF_FUNC(skb_change_proto, struct __sk_buff *skb, uint32_t proto,
uint32_t flags);
static int BPF_FUNC(skb_change_tail, struct __sk_buff *skb, uint32_t nlen,
uint32_t flags);
static int BPF_FUNC(skb_pull_data, struct __sk_buff *skb, uint32_t len);
/* Event notification
 *
 * Uses __BPF_FUNC() directly instead of BPF_FUNC() because the pointer name
 * (skb_event_output) differs from the constant it is bound to
 * (BPF_FUNC_perf_event_output), so the initialiser is written out by hand.
 */
static int __BPF_FUNC(skb_event_output, struct __sk_buff *skb, void *map,
uint64_t index, const void *data, uint32_t size) =
(void *) BPF_FUNC_perf_event_output;
/* Packet vlan encap/decap
 *
 * As with the other helpers, each BPF_FUNC() line defines a static function
 * pointer initialised to (void *) BPF_FUNC_<name>.
 */
static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto,
uint16_t vlan_tci);
static int BPF_FUNC(skb_vlan_pop, struct __sk_buff *skb);
/* Packet tunnel encap/decap
 *
 * The *_key variants exchange a struct bpf_tunnel_key plus its size and a
 * flags word; the *_opt variants exchange an untyped buffer plus its size.
 */
static int BPF_FUNC(skb_get_tunnel_key, struct __sk_buff *skb,
struct bpf_tunnel_key *to, uint32_t size, uint32_t flags);
static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff *skb,
const struct bpf_tunnel_key *from, uint32_t size,
uint32_t flags);
static int BPF_FUNC(skb_get_tunnel_opt, struct __sk_buff *skb,
void *to, uint32_t size);
static int BPF_FUNC(skb_set_tunnel_opt, struct __sk_buff *skb,
const void *from, uint32_t size);
/**
 * LLVM built-ins, mem*() routines work for constant size
 *
 * Each wrapper is only defined when the name is not already a macro.
 */
/* Atomic add of val to *ptr; the fetched old value is discarded (void cast). */
#ifndef lock_xadd
# define lock_xadd(ptr, val) ((void) __sync_fetch_and_add(ptr, val))
#endif
/* memset/memcpy/memmove forwarded to the compiler built-ins. */
#ifndef memset
# define memset(s, c, n) __builtin_memset((s), (c), (n))
#endif
#ifndef memcpy
# define memcpy(d, s, n) __builtin_memcpy((d), (s), (n))
#endif
#ifndef memmove
# define memmove(d, s, n) __builtin_memmove((d), (s), (n))
#endif
/* FIXME: __builtin_memcmp() is not yet fully useable unless llvm bug
 * https://llvm.org/bugs/show_bug.cgi?id=26218 gets resolved. Also
 * this one would generate a reloc entry (non-map), otherwise.
 */
/* The memcmp wrapper is therefore compiled out by the #if 0 below. */
#if 0
#ifndef memcmp
# define memcmp(a, b, n) __builtin_memcmp((a), (b), (n))
#endif
#endif
/* Packet load intrinsics.
 *
 * These are plain declarations with asm labels: the label makes the C names
 * load_byte/load_half/load_word refer to the symbols "llvm.bpf.load.byte",
 * "llvm.bpf.load.half" and "llvm.bpf.load.word". All three take an skb
 * pointer and an offset and return unsigned long long.
 */
unsigned long long load_byte(void *skb, unsigned long long off)
asm ("llvm.bpf.load.byte");
unsigned long long load_half(void *skb, unsigned long long off)
asm ("llvm.bpf.load.half");
unsigned long long load_word(void *skb, unsigned long long off)
asm ("llvm.bpf.load.word");
#endif /* __BPF_API__ */

View File

@ -1,4 +1,3 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_ELF__
#define __BPF_ELF__
@ -16,38 +15,19 @@
/* ELF section names, etc */
#define ELF_SECTION_LICENSE "license"
#define ELF_SECTION_MAPS "maps"
#define ELF_SECTION_PROG "prog"
#define ELF_SECTION_CLASSIFIER "classifier"
#define ELF_SECTION_ACTION "action"
#define ELF_MAX_MAPS 64
#define ELF_MAX_LICENSE_LEN 128
/* Object pinning settings */
#define PIN_NONE 0
#define PIN_OBJECT_NS 1
#define PIN_GLOBAL_NS 2
/* ELF map definition
 *
 * Nine consecutive __u32 fields.
 * NOTE(review): the per-field remarks below are read off the field names
 * only - confirm against the ELF loader before relying on them. This span
 * is taken from a diff in which some of these fields are being removed.
 */
struct bpf_elf_map {
__u32 type;
__u32 size_key; /* presumably key size in bytes */
__u32 size_value; /* presumably value size in bytes */
__u32 max_elem;
__u32 flags;
__u32 id;
__u32 pinning; /* presumably one of the PIN_* constants defined above */
__u32 inner_id;
__u32 inner_idx;
};
/* BPF_ANNOTATE_KV_PAIR(name, type_key, type_val)
 *
 * Declares struct ____btf_map_<name> with two members, `key` of type
 * type_key and `value` of type type_val, and then defines an
 * empty-initialised object of that struct, also called ____btf_map_<name>,
 * placed in ELF section ".maps.<name>" and marked `used`.
 * The expansion has no trailing semicolon; the caller supplies it.
 */
#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \
struct ____btf_map_##name { \
type_key key; \
type_val value; \
}; \
struct ____btf_map_##name \
__attribute__ ((section(".maps." #name), used)) \
____btf_map_##name = { }
#endif /* __BPF_ELF__ */

View File

@ -1,10 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_SCM__
#define __BPF_SCM__
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "utils.h"
#include "bpf_elf.h"

View File

@ -1,327 +0,0 @@
/*
* bpf_util.h BPF common code
*
* This program is free software; you can distribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Daniel Borkmann <daniel@iogearbox.net>
* Jiri Pirko <jiri@resnulli.us>
*/
#ifndef __BPF_UTIL__
#define __BPF_UTIL__
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/magic.h>
#include <linux/elf-em.h>
#include <linux/if_alg.h>
#include "utils.h"
#include "bpf_scm.h"
/* NOTE(review): judging by the BPF_ENV_ prefix these two strings are names
 * of environment variables - confirm at the point of use.
 */
#define BPF_ENV_UDS "TC_BPF_UDS"
#define BPF_ENV_MNT "TC_BPF_MNT"
/* Default for BPF_MAX_LOG; can be overridden by defining it beforehand. */
#ifndef BPF_MAX_LOG
# define BPF_MAX_LOG 4096
#endif
#define BPF_DIR_GLOBALS "globals"
/* The #ifndef-guarded values below are fallbacks that only take effect when
 * the headers included above did not already define the name.
 */
#ifndef BPF_FS_MAGIC
# define BPF_FS_MAGIC 0xcafe4a11
#endif
#define BPF_DIR_MNT "/sys/fs/bpf"
#ifndef TRACEFS_MAGIC
# define TRACEFS_MAGIC 0x74726163
#endif
#define TRACE_DIR_MNT "/sys/kernel/tracing"
#ifndef AF_ALG
# define AF_ALG 38
#endif
#ifndef EM_BPF
# define EM_BPF 247
#endif
/* Pair of callbacks, one per BPF flavour.
 *
 * cbpf_cb receives an array of struct sock_filter and its length; ebpf_cb
 * receives a file descriptor and an annotation string. Both get an opaque
 * `nl` pointer as first argument.
 * NOTE(review): `nl` is presumably the netlink message being built (compare
 * the `nl` argument of bpf_load_common() below) - confirm.
 */
struct bpf_cfg_ops {
void (*cbpf_cb)(void *nl, const struct sock_filter *ops, int ops_len);
void (*ebpf_cb)(void *nl, int fd, const char *annotation);
};
/* Input modes for a BPF configuration (see the `mode` member of
 * struct bpf_cfg_in). Values are written out explicitly and match the
 * implicit numbering; BPF_MODE_MAX equals the number of real modes.
 */
enum bpf_mode {
	CBPF_BYTECODE = 0,
	CBPF_FILE     = 1,
	EBPF_OBJECT   = 2,
	EBPF_PINNED   = 3,
	BPF_MODE_MAX  = 4,
};
/* Input configuration handed to bpf_parse_common() / bpf_load_common() /
 * bpf_parse_and_load_common() (see their prototypes below).
 *
 * NOTE(review): the member remarks are read off names and types only -
 * confirm against the implementation.
 */
struct bpf_cfg_in {
const char *object;
const char *section;
const char *uds;
enum bpf_prog_type type;
enum bpf_mode mode;
__u32 ifindex;
bool verbose;
int argc; /* presumably the remaining command-line arguments ... */
char **argv; /* ... and their vector */
struct sock_filter opcodes[BPF_MAXINSNS]; /* fixed-size inline array */
/* Anonymous union: n_opcodes and prog_fd share the same storage, so only
 * one of them is meaningful at a time.
 */
union {
int n_opcodes;
int prog_fd;
};
};
/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg
 *
 * Both macros expand to a (struct bpf_insn) compound literal with .off and
 * .imm set to 0. OP is passed through BPF_OP(); BPF_ALU64_REG uses class
 * BPF_ALU64, BPF_ALU32_REG uses class BPF_ALU, each OR-ed with BPF_X.
 */
#define BPF_ALU64_REG(OP, DST, SRC) \
((struct bpf_insn) { \
.code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = 0 })
#define BPF_ALU32_REG(OP, DST, SRC) \
((struct bpf_insn) { \
.code = BPF_ALU | BPF_OP(OP) | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = 0 })
/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32
 *
 * Same shape as the register forms above, but OR-ed with BPF_K, with
 * .src_reg fixed to 0 and the operand carried in .imm.
 */
#define BPF_ALU64_IMM(OP, DST, IMM) \
((struct bpf_insn) { \
.code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
.dst_reg = DST, \
.src_reg = 0, \
.off = 0, \
.imm = IMM })
#define BPF_ALU32_IMM(OP, DST, IMM) \
((struct bpf_insn) { \
.code = BPF_ALU | BPF_OP(OP) | BPF_K, \
.dst_reg = DST, \
.src_reg = 0, \
.off = 0, \
.imm = IMM })
/* Short form of mov, dst_reg = src_reg
 *
 * Equivalent to the ALU register forms with the operation fixed to BPF_MOV
 * (written directly, not through BPF_OP()); 64-bit variant uses BPF_ALU64,
 * 32-bit variant uses BPF_ALU.
 */
#define BPF_MOV64_REG(DST, SRC) \
((struct bpf_insn) { \
.code = BPF_ALU64 | BPF_MOV | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = 0 })
#define BPF_MOV32_REG(DST, SRC) \
((struct bpf_insn) { \
.code = BPF_ALU | BPF_MOV | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = 0 })
/* Short form of mov, dst_reg = imm32
 *
 * Immediate counterpart: BPF_K instead of BPF_X, .src_reg fixed to 0 and the
 * value carried in .imm.
 */
#define BPF_MOV64_IMM(DST, IMM) \
((struct bpf_insn) { \
.code = BPF_ALU64 | BPF_MOV | BPF_K, \
.dst_reg = DST, \
.src_reg = 0, \
.off = 0, \
.imm = IMM })
#define BPF_MOV32_IMM(DST, IMM) \
((struct bpf_insn) { \
.code = BPF_ALU | BPF_MOV | BPF_K, \
.dst_reg = DST, \
.src_reg = 0, \
.off = 0, \
.imm = IMM })
/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn
 *
 * BPF_LD_IMM64_RAW expands to TWO comma-separated (struct bpf_insn) compound
 * literals, so it occupies two instruction slots:
 *   1st: opcode BPF_LD | BPF_DW | BPF_IMM, DST, SRC and the low 32 bits of
 *        IMM (truncating cast to __u32);
 *   2nd: opcode 0 with all register/offset fields 0 and the upper 32 bits of
 *        IMM (IMM widened to __u64, shifted right by 32).
 * Because of the top-level comma it is only usable where two list elements
 * are accepted, e.g. inside a struct bpf_insn array initialiser.
 * BPF_LD_IMM64 is the same with src_reg fixed to 0.
 */
#define BPF_LD_IMM64(DST, IMM) \
BPF_LD_IMM64_RAW(DST, 0, IMM)
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
((struct bpf_insn) { \
.code = BPF_LD | BPF_DW | BPF_IMM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = (__u32) (IMM) }), \
((struct bpf_insn) { \
.code = 0, /* zero is reserved opcode */ \
.dst_reg = 0, \
.src_reg = 0, \
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })
/* Fallback value for BPF_PSEUDO_MAP_FD when the included headers do not
 * define it.
 */
#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD 1
#endif
/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd
 *
 * Implemented as BPF_LD_IMM64_RAW with src_reg set to BPF_PSEUDO_MAP_FD and
 * the file descriptor as the immediate, so it also expands to two
 * instruction slots.
 */
#define BPF_LD_MAP_FD(DST, MAP_FD) \
BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
/* Direct packet access, R0 = *(uint *) (skb->data + imm32)
 *
 * SIZE is passed through BPF_SIZE(); both register fields and .off are 0,
 * the packet offset is carried in .imm.
 */
#define BPF_LD_ABS(SIZE, IMM) \
((struct bpf_insn) { \
.code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
.dst_reg = 0, \
.src_reg = 0, \
.off = 0, \
.imm = IMM })
/* Memory load, dst_reg = *(uint *) (src_reg + off16)
 *
 * Class BPF_LDX, mode BPF_MEM; the displacement goes in .off, .imm is 0.
 */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
((struct bpf_insn) { \
.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = 0 })
/* Memory store, *(uint *) (dst_reg + off16) = src_reg
 *
 * Class BPF_STX: the stored value comes from src_reg, .imm is 0.
 */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
((struct bpf_insn) { \
.code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = 0 })
/* Memory store, *(uint *) (dst_reg + off16) = imm32
 *
 * Class BPF_ST: the stored value comes from .imm, src_reg is 0. Note the
 * parameter order (OFF before IMM) differs from BPF_STX_MEM.
 */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
((struct bpf_insn) { \
.code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
.dst_reg = DST, \
.src_reg = 0, \
.off = OFF, \
.imm = IMM })
/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16
 *
 * Class BPF_JMP with the comparison OP passed through BPF_OP() and BPF_X;
 * the jump displacement is carried in .off, .imm is 0.
 */
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = 0 })
/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16
 *
 * Same as above with BPF_K: src_reg is 0 and the comparison operand is in
 * .imm. Note the parameter order is (OP, DST, IMM, OFF).
 */
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
.dst_reg = DST, \
.src_reg = 0, \
.off = OFF, \
.imm = IMM })
/* Raw code statement block
 *
 * Builds a (struct bpf_insn) compound literal with every field supplied by
 * the caller; nothing is combined or defaulted.
 */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
((struct bpf_insn) { \
.code = CODE, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = OFF, \
.imm = IMM })
/* Program exit
 *
 * Opcode BPF_JMP | BPF_EXIT with all other fields 0; takes no arguments.
 */
#define BPF_EXIT_INSN() \
((struct bpf_insn) { \
.code = BPF_JMP | BPF_EXIT, \
.dst_reg = 0, \
.src_reg = 0, \
.off = 0, \
.imm = 0 })
int bpf_parse_common(struct bpf_cfg_in *cfg, const struct bpf_cfg_ops *ops);
int bpf_load_common(struct bpf_cfg_in *cfg, const struct bpf_cfg_ops *ops,
void *nl);
int bpf_parse_and_load_common(struct bpf_cfg_in *cfg,
const struct bpf_cfg_ops *ops, void *nl);
const char *bpf_prog_to_default_section(enum bpf_prog_type type);
int bpf_graft_map(const char *map_path, uint32_t *key, int argc, char **argv);
int bpf_trace_pipe(void);
void bpf_print_ops(struct rtattr *bpf_ops, __u16 len);
int bpf_prog_load_dev(enum bpf_prog_type type, const struct bpf_insn *insns,
size_t size_insns, const char *license, __u32 ifindex,
char *log, size_t size_log);
int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns,
size_t size_insns, const char *license, char *log,
size_t size_log);
int bpf_prog_attach_fd(int prog_fd, int target_fd, enum bpf_attach_type type);
int bpf_prog_detach_fd(int target_fd, enum bpf_attach_type type);
int bpf_program_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
int bpf_dump_prog_info(FILE *f, uint32_t id);
#ifdef HAVE_ELF
int bpf_send_map_fds(const char *path, const char *obj);
int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
unsigned int entries);
#ifdef HAVE_LIBBPF
int iproute2_bpf_elf_ctx_init(struct bpf_cfg_in *cfg);
int iproute2_bpf_fetch_ancillary(void);
int iproute2_get_root_path(char *root_path, size_t len);
bool iproute2_is_pin_map(const char *libbpf_map_name, char *pathname);
bool iproute2_is_map_in_map(const char *libbpf_map_name, struct bpf_elf_map *imap,
struct bpf_elf_map *omap, char *omap_name);
int iproute2_find_map_name_by_id(unsigned int map_id, char *name);
int iproute2_load_libbpf(struct bpf_cfg_in *cfg);
#endif /* HAVE_LIBBPF */
#else
/* Stub selected when HAVE_ELF is not defined: nothing is sent and the call
 * always reports success (0). Both arguments are ignored.
 */
static inline int bpf_send_map_fds(const char *path, const char *obj)
{
	const int status = 0;

	(void)path;
	(void)obj;
	return status;
}
/* Stub selected when HAVE_ELF is not defined: no descriptors are received
 * and the call always fails with -1. None of the arguments is touched.
 */
static inline int bpf_recv_map_fds(const char *path, int *fds,
				   struct bpf_map_aux *aux,
				   unsigned int entries)
{
	const int failure = -1;

	(void)path;
	(void)fds;
	(void)aux;
	(void)entries;
	return failure;
}
#ifdef HAVE_LIBBPF
/* Stub selected when HAVE_LIBBPF is defined but HAVE_ELF is not: writes a
 * fixed diagnostic to stderr and fails with -1. cfg is ignored.
 */
static inline int iproute2_load_libbpf(struct bpf_cfg_in *cfg)
{
	static const char msg[] = "No ELF library support compiled in.\n";

	(void)cfg;
	fputs(msg, stderr);
	return -1;
}
#endif /* HAVE_LIBBPF */
#endif /* HAVE_ELF */
const char *get_libbpf_version(void);
#endif /* __BPF_UTIL__ */

Some files were not shown because too many files have changed in this diff Show More