diff --git a/.gitignore b/.gitignore index 98d83c5d..ef03b174 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ doc/*.ps doc/*.dvi doc/*.html doc/*.pdf +doc/*.out diff --git a/doc/Makefile b/doc/Makefile index e9c0ff79..0c51872a 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,4 +1,4 @@ -PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps +PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps tc-filters.ps # tc-cref.ps # api-rtnl.tex api-pmtudisc.tex api-news.tex # iki-netdev.ps iki-neighdst.ps diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex index 67094c95..242cc266 100644 --- a/doc/ip-cref.tex +++ b/doc/ip-cref.tex @@ -2049,9 +2049,6 @@ table \verb|local| (ID 255). The \verb|local| table is a special routing table containing high priority control routes for local and broadcast addresses. -Rule 0 is special. It cannot be deleted or overridden. - - \item Priority: 32766, Selector: match anything, Action: lookup routing table \verb|main| (ID 254). The \verb|main| table is the normal routing table containing all non-policy diff --git a/doc/tc-filters.tex b/doc/tc-filters.tex new file mode 100644 index 00000000..59127d66 --- /dev/null +++ b/doc/tc-filters.tex @@ -0,0 +1,529 @@ +\documentclass[12pt,twoside]{article} + +\usepackage[hidelinks]{hyperref} % \url +\usepackage{booktabs} % nicer tabulars +\usepackage{fancyvrb} +\usepackage{fullpage} +\usepackage{float} + +\newcommand{\iface}{\textit} +\newcommand{\cmd}{\texttt} +\newcommand{\man}{\textit} +\newcommand{\qdisc}{\texttt} +\newcommand{\filter}{\texttt} + +\begin{document} +\title{QoS in Linux with TC and Filters} +\author{Phil Sutter (phil@nwl.cc)} +\date{January 2016} +\maketitle + +TC, the Traffic Control utility, has been there for a very long time - forever +in my humble perception. It is still (and has ever been if I'm not mistaken) the +only tool to configure QoS in Linux. 
+ +Standard practice when transmitting packets over a medium which may block (due +to congestion, e.g.) is to use a queue which temporarily holds these packets. In +Linux, this queueing approach is where QoS happens: A Queueing Discipline +(qdisc) holds multiple packet queues with different priorities for dequeueing to +the network driver. The classification (i.e. deciding which queue a packet +should go into) is typically done based on Type Of Service (IPv4) or Traffic +Class (IPv6) header fields but depending on qdisc implementation, might be +controlled by the user as well. + +Qdiscs come in two flavors, classful or classless. While classless qdiscs are +not as flexible as classful ones, they also require much less customizing. Often +it is enough to just attach them to an interface, without exact knowledge of +what is done internally. Classful qdiscs are the exact opposite: flexible in +application, they are often not even usable without insightful configuration. + +As the name implies, classful qdiscs provide configurable classes to sort +traffic into. In its basic form, this is not much different than, say, the +classless \qdisc{pfifo\_fast} which holds three queues and classifies per +packet upon priority field. Though typically classes go beyond that by +supporting nesting and additional characteristics like e.g. maximum traffic +rate or quantum. + +When it comes to controlling the classification process, filters come into play. +They attach to the parent of a set of classes (i.e. either the qdisc itself or +a parent class) and specify what a packet (or its associated flow) has to look +like in order to suit a given class. To overcome this simplification, it is +possible to attach multiple filters to the same parent, which then consults each +of them in turn until the first one accepts the packet. 
+ +Before getting into detail about what filters there are and how to use them, a +simple setup of a qdisc with classes is necessary: +\begin{figure}[H] +\begin{Verbatim} + .-------------------------------------------------------. + | | + | HTB | + | | + | .----------------------------------------------------.| + | | || + | | Class 1:1 || + | | || + | | .---------------..---------------..---------------.|| + | | | || || ||| + | | | Class 1:10 || Class 1:20 || Class 1:30 ||| + | | | || || ||| + | | | .------------.|| .------------.|| .------------.||| + | | | | ||| | ||| | |||| + | | | | fq_codel ||| | fq_codel ||| | fq_codel |||| + | | | | ||| | ||| | |||| + | | | '------------'|| '------------'|| '------------'||| + | | '---------------''---------------''---------------'|| + | '----------------------------------------------------'| + '-------------------------------------------------------' +\end{Verbatim} +\end{figure} +\noindent +The following commands establish the basic setup shown: +\begin{Verbatim} +(1) # tc qdisc replace dev eth0 root handle 1: htb default 30 +(2) # tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit +(3) # alias tclass='tc class add dev eth0 parent 1:1' +(4) # tclass classid 1:10 htb rate 1mbit ceil 20mbit prio 1 +(4) # tclass classid 1:20 htb rate 90mbit ceil 95mbit prio 2 +(4) # tclass classid 1:30 htb rate 1mbit ceil 95mbit prio 3 +(5) # tc qdisc add dev eth0 parent 1:10 fq_codel +(5) # tc qdisc add dev eth0 parent 1:20 fq_codel +(5) # tc qdisc add dev eth0 parent 1:30 fq_codel +\end{Verbatim} +A little explanation for the unfamiliar reader: +\begin{enumerate} +\item Replace the root qdisc of \iface{eth0} by an instance of \qdisc{HTB}. + Specifying the handle is necessary so it can be referenced in consecutive + calls to \cmd{tc}. The default class for unclassified traffic is set to + 30. +\item Create a single top-level class with handle 1:1 which limits the total + bandwidth allowed to 95mbit/s. 
It is assumed that \iface{eth0} is a 100mbit/s link; + staying a little below that helps to keep the main point of enqueueing in + the qdisc layer instead of the interface hardware queue or at another + bottleneck in the network. +\item Define an alias for the common part of the remaining three calls in order + to improve readability. This means all remaining classes are attached to the + common parent class from (2). +\item Create three child classes for different uses: Class 1:10 has highest + priority but is tightly limited in bandwidth - fine for interactive + connections. Class 1:20 has mid priority and high guaranteed bandwidth, for + high priority bulk traffic. Finally, there's the default class 1:30 with + lowest priority, low guaranteed bandwidth and the ability to use the full + link in case it's unused otherwise. This should be fine for uninteresting + traffic not explicitly taken care of. +\item Attach a leaf qdisc to each of the child classes created in (4). Since + \qdisc{HTB} by default attaches \qdisc{pfifo} as leaf qdisc, this step is optional. Still, + the fairness between different flows provided by the classless \qdisc{fq\_codel} is + worth the effort. +\end{enumerate} +More information about the qdiscs and fine-tuning parameters can be found in +\man{tc-htb(8)} and \man{tc-fq\_codel(8)}. + +Without any additional setup done, now all traffic leaving \iface{eth0} is shaped to +95mbit/s and directed through class 1:30. This can be verified by looking at the +\texttt{Sent} field of the class statistics printed via \cmd{tc -s class show dev eth0}: +Only the root class 1:1 and its child 1:30 should show any traffic. + + +\section*{Finally time to start filtering!} + +Let's begin with a simple one, i.e. reestablishing what \qdisc{pfifo\_fast} did +automatically based on TOS/Priority field. Linux internally translates the +header field into the priority field of struct skbuff, which +\qdisc{pfifo\_fast} uses for +classification. 
\man{tc-prio(8)} contains a table listing the priority (and +ultimately, \qdisc{pfifo\_fast} queue index) each TOS value is being translated into. +Here is a shorter version: +\begin{center} +\begin{tabular}{lll} +TOS Values & Linux Priority (Number) & Queue Index \\ +\midrule +0x0 - 0x6 & Best Effort (0) & 1 \\ +0x8 - 0xe & Bulk (2) & 2 \\ +0x10 - 0x16 & Interactive (6) & 0 \\ +0x18 - 0x1e & Interactive Bulk (4) & 1 \\ +\end{tabular} +\end{center} +Using the \filter{basic} filter, it is possible to match packets based on that skbuff +field, which has the added benefit of being IP version agnostic. Since the +\qdisc{HTB} setup above defaults to class ID 1:30, the Bulk priority can be +ignored. The \filter{basic} filter allows to combine matches, therefore we get along +with only two filters: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: basic \ + match 'meta(priority eq 6)' classid 1:10 +# tc filter add dev eth0 parent 1: basic \ + match 'meta(priority eq 0)' \ + or 'meta(priority eq 4)' classid 1:20 +\end{Verbatim} +A detailed description of the \filter{basic} filter and the ematch syntax it uses can be +found in \man{tc-basic(8)} and \man{tc-ematch(8)}. + +Obviously, this first example cries for optimization. A simple one would be to +just change the default class from 1:30 to 1:20, so filters are only needed for +Bulk and Interactive priorities: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: basic \ + match 'meta(priority eq 6)' classid 1:10 +# tc filter add dev eth0 parent 1: basic \ + match 'meta(priority eq 2)' classid 1:20 +\end{Verbatim} +Given that class IDs are random, choosing them wisely allows for a direct +mapping. 
So first, recreate the qdisc and classes configuration: +\begin{Verbatim} +# tc qdisc replace dev eth0 root handle 1: htb default 10 +# tc class add dev eth0 parent 1: classid 1:1 htb rate 95mbit +# alias tclass='tc class add dev eth0 parent 1:1' +# tclass classid 1:16 htb rate 1mbit ceil 20mbit prio 1 +# tclass classid 1:10 htb rate 90mbit ceil 95mbit prio 2 +# tclass classid 1:12 htb rate 1mbit ceil 95mbit prio 3 +# tc qdisc add dev eth0 parent 1:16 fq_codel +# tc qdisc add dev eth0 parent 1:10 fq_codel +# tc qdisc add dev eth0 parent 1:12 fq_codel +\end{Verbatim} +This is basically identical to above, but with changed leaf class IDs and the +second priority class being the default. Using the \filter{flow} filter with its \texttt{map} +functionality, a single filter command is enough: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: handle 0x1337 flow \ + map key priority baseclass 1:10 +\end{Verbatim} +The \filter{flow} filter now uses the priority value to construct a destination class ID +by adding it to the value of \texttt{baseclass}. While this works for priority values of +0, 2 and 6, it will result in non-existent class ID 1:14 for Interactive Bulk +traffic. In that case, the \qdisc{HTB} default applies so that traffic goes into class +ID 1:10 just as intended. Please note that specifying a handle is a mandatory +requirement of the \filter{flow} filter, although I didn't see where one would use that +later. For more information about \filter{flow}, see \man{tc-flow(8)}. + +While \filter{flow} and \filter{basic} filters are relatively easy to apply and understand, they +are also quite limited to their intended purpose. A more flexible option is +the \filter{u32} filter, which allows matching on arbitrary parts of the packet data - +yet only on that, not any meta data associated with it by the kernel (with the +exception of firewall mark value). 
So in order to continue this little +exercise with \filter{u32}, we have to base classification directly upon the actual TOS +value. An intuitive attempt might look like this: +\begin{Verbatim} +# alias tcfilter='tc filter add dev eth0 parent 1:' +# tcfilter u32 match ip dsfield 0x10 0x1e classid 1:16 +# tcfilter u32 match ip dsfield 0x12 0x1e classid 1:16 +# tcfilter u32 match ip dsfield 0x14 0x1e classid 1:16 +# tcfilter u32 match ip dsfield 0x16 0x1e classid 1:16 +# tcfilter u32 match ip dsfield 0x8 0x1e classid 1:12 +# tcfilter u32 match ip dsfield 0xa 0x1e classid 1:12 +# tcfilter u32 match ip dsfield 0xc 0x1e classid 1:12 +# tcfilter u32 match ip dsfield 0xe 0x1e classid 1:12 +\end{Verbatim} +The obvious drawback here is the number of filters needed. And without the +default class, eight more filters would be necessary. This also has performance +implications: A packet with TOS value 0xe will be checked eight times in total +in order to determine its destination class. While there's not much to be done +about the number of filters, at least the performance problem can be eliminated +by using \filter{u32}'s hash table support: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: prio 99 handle 1: u32 divisor 16 +\end{Verbatim} +This creates a hash table with 16 buckets. The table size is arbitrary, but not +random: Since the first bit of the TOS field is not interesting, it can be +ignored and therefore the range of values to consider is just [0;15], i.e. a +number of 16 different values. 
The next step is to populate the hash table: +\begin{Verbatim} +# alias tcfilter='tc filter add dev eth0 parent 1: prio 99' +# tcfilter u32 match u8 0 0 ht 1:0: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:1: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:2: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:3: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:4: classid 1:12 +# tcfilter u32 match u8 0 0 ht 1:5: classid 1:12 +# tcfilter u32 match u8 0 0 ht 1:6: classid 1:12 +# tcfilter u32 match u8 0 0 ht 1:7: classid 1:12 +# tcfilter u32 match u8 0 0 ht 1:8: classid 1:16 +# tcfilter u32 match u8 0 0 ht 1:9: classid 1:16 +# tcfilter u32 match u8 0 0 ht 1:a: classid 1:16 +# tcfilter u32 match u8 0 0 ht 1:b: classid 1:16 +# tcfilter u32 match u8 0 0 ht 1:c: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:d: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:e: classid 1:10 +# tcfilter u32 match u8 0 0 ht 1:f: classid 1:10 +\end{Verbatim} +The parameter \texttt{ht} denotes the hash table and bucket the filter should be added +to. Since the first TOS bit is ignored, the TOS value has to be divided by two in +order to get to the bucket it maps to. E.g. a TOS value of 0x10 will therefore +map to bucket 0x8. For the sake of completeness, all possible values are mapped +and therefore a configurable default class is not required. Note that the match +expression used here serves no filtering purpose, but is syntactically mandatory. Therefore anything that +matches any packet will suffice. Finally, a filter which links to the defined +hash table is needed: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: prio 1 protocol ip u32 \ + link 1: hashkey mask 0x001e0000 match u8 0 0 +\end{Verbatim} +Here again, the actual match statement is not necessary, but syntactically +required. All the magic lies within the \texttt{hashkey} parameter, which defines which +part of the packet should be used directly as hash key. 
Here's a drawing of the +first four bytes of the IPv4 header, with the area selected by \texttt{hashkey mask} +highlighted: +\begin{figure}[H] +\begin{Verbatim} + 0 1 2 3 + .-----------------------------------------------------------------. + | | | ######## | | | + | Version| IHL | #DSCP### | ECN| Total Length | + | | | ######## | | | + '-----------------------------------------------------------------' +\end{Verbatim} +\end{figure} +\noindent +This may look confusing at first, but keep in mind that bit- as well as +byte-ordering here is LSB while the mask value is written in MSB we humans use. +Therefore reading the mask is done like so, starting from left: +\begin{enumerate} +\item Skip the first byte (which contains Version and IHL fields). +\item Skip the lowest bit of the second byte (0x1e is even). +\item Mark the four following bits (0x1e is 11110 in binary). +\item Skip the remaining three bits of the second byte as well as the remaining two + bytes. +\end{enumerate} +Before doing the lookup, the kernel right-shifts the masked value by the amount +of zero-bits in \texttt{mask}, which implicitly also does the division by two which the +hash table depends on. With this setup, every packet has to pass exactly two +filters to be classified. Note that this filter is limited to IPv4 packets: Due +to the related Traffic Class field being at a different offset in the packet, it +would not work for IPv6. To use the same setup for IPv6 as well, a second +entry-level filter is necessary: +\begin{Verbatim} +# tc filter add dev eth0 parent 1: prio 2 protocol ipv6 u32 \ + link 1: hashkey mask 0x01e00000 match u8 0 0 +\end{Verbatim} +For illustration purposes, here again is a drawing of the first four bytes of +the IPv6 header, again with masked area highlighted: +\begin{figure}[H] +\begin{Verbatim} + 0 1 2 3 + .-----------------------------------------------------------------. 
+ | | ######## | | + | Version| #Traffic Class| Flow Label | + | | ######## | | + '-----------------------------------------------------------------' +\end{Verbatim} +\end{figure} +\noindent +Reading the mask value is analogous to IPv4 with the added complexity that +Traffic Class spans over two bytes. Yet, for comparison there's a simple trick: +IPv6 has the interesting field shifted by four bits to the left, and the new +mask's value is shifted by the same amount. For further information about +\filter{u32} and what can be done with it, consult its man page +\man{tc-u32(8)}. + +Of course, the kernel provides many more filters than just \filter{basic}, +\filter{flow} and \filter{u32} which have been presented above. As of now, the +remaining ones are: +\begin{description} +\item[bpf] + Filtering using Berkeley Packet Filter programs. The program's return + code determines the packet's destination class ID. + +\item[cgroup] + Filter packets based on control groups. This is only useful for packets + originating from the local host, as control groups only exist in that + scope. + +\item[flower] + An extended variant of the flow filter. + +\item[fw] + Matches on firewall mark values previously assigned to the packet by + netfilter (or a filter action, see below for details). This allows exporting + the classification algorithm into netfilter, which is very + convenient if appropriate rules exist on the same system in there + already. + +\item[route] + Filter packets based on matching routing table entry. Basically + equivalent to the \texttt{fw} filter above, to make use of an already existing + extensive routing table setup. + +\item[rsvp, rsvp6] + Implementation of the Resource Reservation Protocol in Linux, to react + upon requests sent by an RSVP daemon. + +\item[tcindex] + Match packets based on tcindex value, which is usually set by the dsmark + qdisc. This is part of an approach to support Differentiated Services in + Linux, which is another topic of its own. 
+\end{description} + + +\section*{Filter Actions} + +The tc filter framework provides the infrastructure for another extensible set of +tools as well, namely tc actions. As the name suggests, they allow doing things +with packets (or associated data). (The list of) Actions are part of a given +filter. If it matches, each action it contains is executed in order before +returning the classification result. Since the action has direct access to the +latter, it is in theory possible for an action to react upon or even change the +filtering result - as long as the packet matched, of course. Yet none of the +currently in-tree actions make use of this. + +The Generic Actions framework originally evolved out of the filters' ability to +police traffic to a given maximum bandwidth. One common use case for that is to +limit ingress traffic, dropping packets which exceed the threshold. A classic +setup example is like so: +\begin{Verbatim} +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \ + match u32 0 0 \ + police rate 1mbit burst 100k +\end{Verbatim} +The ingress qdisc is not a real one, but merely a point of reference for filters +to attach to which should get applied to incoming traffic. The \filter{u32} filter added +above matches on any packet and therefore limits the total incoming bandwidth to +1mbit/s, allowing bursts of up to 100kbytes. Using the new syntax, the filter +command changes slightly: +\begin{Verbatim} +# tc filter add dev eth0 parent ffff: u32 \ + match u32 0 0 \ + action police rate 1mbit burst 100k +\end{Verbatim} +The important detail is that this syntax allows defining multiple actions. +E.g. 
for testing purposes, it is possible to redirect exceeding traffic to the +loopback interface instead of dropping it: +\begin{Verbatim} +# tc filter add dev eth0 parent ffff: u32 \ + match u32 0 0 \ + action police rate 1mbit burst 100k conform-exceed pipe \ + action mirred egress redirect dev lo +\end{Verbatim} +The added parameter \texttt{conform-exceed pipe} tells the police action to allow for +further actions to handle the exceeding packet. + +Apart from \texttt{police} and \texttt{mirred} actions, there are a few more. Here's a full +list of the currently implemented ones: +\begin{description} +\item[bpf] + Apply a Berkeley Packet Filter program to the packet. + +\item[connmark] + Set the packet's firewall mark to that of its connection. This works by + searching the conntrack table for a matching entry. If found, the mark + is restored. + +\item[csum] + Trigger recalculation of packet checksums. The supported protocols are: + IPv4, ICMP, IGMP, TCP, UDP and UDPLite. + +\item[ipt] + Pass the packet to an iptables target. This allows using iptables + extensions directly instead of having to go the extra mile via setting + an arbitrary firewall mark and matching on that from within netfilter. + +\item[mirred] + Mirror or redirect packets. This is often combined with the ifb pseudo + device to share a common QoS setup between multiple interfaces or even + ingress traffic. + +\item[nat] + Perform stateless Network Address Translation. This is certainly not + complete and therefore inferior to NAT using iptables: Although the + kernel module distinguishes between TCP, UDP and ICMP traffic, it does not + handle typical problematic protocols such as active FTP or SIP. + +\item[pedit] + Generic packet editing. This allows altering arbitrary bytes of the + packet, either by specifying an offset into the packet or by naming a + packet header and field name to change. Currently, the latter is + implemented only for IPv4. 
+ +\item[police] + Apply a bandwidth rate limiting policy. Packets exceeding it are dropped + by default, but may optionally be handled differently. + +\item[simple] + This is rather an example than real action. All it does is print a + user-defined string together with a packet counter. Useful maybe for + debugging when filter statistics are not available or too complicated. + +\item[skbedit] + Edit associated packet data, supports changing queue mapping, priority + field and firewall mark value. + +\item[vlan] + Add/remove a VLAN header to/from the packet. This might serve as + alternative to using 802.1Q pseudo-interfaces in combination with + routing rules when e.g. packets for a given destination need to be + encapsulated. +\end{description} + + +\section*{Intermediate Functional Block} + +The Intermediate Functional Block (\texttt{ifb}) pseudo network interface acts as a QoS +concentrator for multiple different sources of traffic. Packets from or to other +interfaces have to be redirected to it using the \texttt{mirred} action in order to be +handled, regularly routed traffic will be dropped. This way, a single stack of +qdiscs, classes and filters can be shared between multiple interfaces. + +Here's a simple example to feed incoming traffic from multiple interfaces +through a Stochastic Fairness Queue (\qdisc{sfq}): +\begin{Verbatim} +(1) # modprobe ifb +(2) # ip link set ifb0 up +(3) # tc qdisc add dev ifb0 root sfq +\end{Verbatim} +The first step is to load the \texttt{ifb} kernel module (1). By default, this will +create two ifb devices: \iface{ifb0} and \iface{ifb1}. After setting +\iface{ifb0} up in (2), the root +qdisc is replaced by \qdisc{sfq} in (3). Finally, one can start redirecting ingress +traffic to \iface{ifb0}, e.g. 
from \iface{eth0}: +\begin{Verbatim} +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \ + match u32 0 0 \ + action mirred egress redirect dev ifb0 +\end{Verbatim} +The same can be done for other interfaces, just replacing \iface{eth0} in the two +commands above. One thing to keep in mind here is the asymmetrical routing this +creates within the host doing the QoS: Incoming packets enter the system via +\iface{ifb0}, while corresponding replies leave directly via \iface{eth0}. This can be observed +using \cmd{tcpdump} on \iface{ifb0}, which shows the input part of the traffic only. What's +more confusing is that \cmd{tcpdump} on \iface{eth0} shows both incoming and outgoing traffic, +but the redirection is still effective - a simple proof is setting +\iface{ifb0} down, +which will interrupt the communication. Obviously \cmd{tcpdump} catches the packets to +dump before they enter the ingress qdisc, which is why it sees them while the +kernel itself doesn't. + + +\section*{Conclusion} + +My personal impression is that although the \cmd{tc} utility is an absolute +necessity for anyone aiming at doing QoS in Linux professionally, there are way +too many loose ends and trip wires present in its environment. Contributing to +this is the fact that much of the non-essential functionality is redundantly +available in netfilter. Another problem which adds weight to the first one is a +general lack of documentation. Of course, there are many HOWTOs and guides on +the internet, but since it's often not clear how up to date these are, I prefer +the usual resources such as man or info pages. Surely nothing one couldn't fix +in hindsight, but quality certainly suffers if the original author of the code +does not or cannot contribute to that. 
+ +All that being said, once the steep learning curve has been mastered, the +conglomerate of (classful) qdiscs, filters and actions provides a highly +sophisticated and flexible infrastructure to perform QoS, which plays nicely +along with routing and firewalling setups. + + +\section*{Further Reading} + +A good starting point for novice users and experienced ones diving into unknown +areas is the extensive HOWTO at \url{http://lartc.org}. The iproute2 package ships +some examples (usually in /usr/share/doc/, depending on distribution) as well as +man pages for \cmd{tc} in general, qdiscs and filters. The latter have been added +just recently though, so if your distribution does not ship iproute2 version +4.3.0 yet, these are not in there. Apart from that, the internet is a spring of +HOWTOs and scripts people wrote - though these should be taken with a grain of +salt: The complexity of the matter often leads to copying others' solutions +without much validation, which allows for less optimal or even obsolete +implementations to survive much longer than desired. 
+ +\end{document} diff --git a/ip/ip.c b/ip/ip.c index eea00b82..123f1813 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -49,7 +49,7 @@ static void usage(void) fprintf(stderr, "Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n" " ip [ -force ] -batch filename\n" -"where OBJECT := { link | address | addrlabel | route | rule | neighbor | ntable |\n" +"where OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n" " tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n" " netns | l2tp | fou | tcp_metrics | token | netconf }\n" " OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n" diff --git a/ip/ipaddrlabel.c b/ip/ipaddrlabel.c index f01bc269..ef093cbe 100644 --- a/ip/ipaddrlabel.c +++ b/ip/ipaddrlabel.c @@ -49,7 +49,8 @@ static void usage(void) __attribute__((noreturn)); static void usage(void) { - fprintf(stderr, "Usage: ip addrlabel [ list | add | del | flush ] prefix PREFIX [ dev DEV ] [ label LABEL ]\n"); + fprintf(stderr, "Usage: ip addrlabel { add | del } prefix PREFIX [ dev DEV ] [ label LABEL ]\n"); + fprintf(stderr, " ip addrlabel [ list | flush | help ]\n"); exit(-1); } diff --git a/ip/iplink.c b/ip/iplink.c index 69f50572..33d7c0ad 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -70,17 +70,16 @@ void iplink_usage(void) fprintf(stderr, " [ address LLADDR ]\n"); fprintf(stderr, " [ broadcast LLADDR ]\n"); fprintf(stderr, " [ mtu MTU ]\n"); - fprintf(stderr, " [ netns PID ]\n"); - fprintf(stderr, " [ netns NAME ]\n"); + fprintf(stderr, " [ netns { PID | NAME } ]\n"); fprintf(stderr, " [ link-netnsid ID ]\n"); fprintf(stderr, " [ alias NAME ]\n"); fprintf(stderr, " [ vf NUM [ mac LLADDR ]\n"); fprintf(stderr, " [ vlan VLANID [ qos VLAN-QOS ] ]\n"); - fprintf(stderr, " [ rate TXRATE ] ]\n"); + fprintf(stderr, " [ rate TXRATE ]\n"); - fprintf(stderr, " [ spoofchk { on | off} ] ]\n"); - fprintf(stderr, " [ query_rss { on | off} ] ]\n"); + fprintf(stderr, " [ spoofchk { on | off} ]\n"); + fprintf(stderr, " [ query_rss { on | 
off} ]\n"); fprintf(stderr, " [ state { auto | enable | disable} ] ]\n"); fprintf(stderr, " [ trust { on | off} ] ]\n"); fprintf(stderr, " [ master DEVICE ]\n"); diff --git a/ip/ipneigh.c b/ip/ipneigh.c index 92b7cd6f..48cca196 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -46,10 +46,11 @@ static void usage(void) __attribute__((noreturn)); static void usage(void) { - fprintf(stderr, "Usage: ip neigh { add | del | change | replace } { ADDR [ lladdr LLADDR ]\n" - " [ nud { permanent | noarp | stale | reachable } ]\n" - " | proxy ADDR } [ dev DEV ]\n"); - fprintf(stderr, " ip neigh {show|flush} [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n"); + fprintf(stderr, "Usage: ip neigh { add | del | change | replace }\n" + " { ADDR [ lladdr LLADDR ] [ nud STATE ] | proxy ADDR } [ dev DEV ]\n"); + fprintf(stderr, " ip neigh { show | flush } [ proxy ] [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n\n"); + fprintf(stderr, "STATE := { permanent | noarp | stale | reachable | none |\n" + " incomplete | delay | probe | failed }\n"); exit(-1); } diff --git a/ip/ipntable.c b/ip/ipntable.c index 6eb84e79..2763570a 100644 --- a/ip/ipntable.c +++ b/ip/ipntable.c @@ -52,7 +52,7 @@ static void usage(void) "PARMS := [ base_reachable MSEC ] [ retrans MSEC ] [ gc_stale MSEC ]\n" " [ delay_probe MSEC ] [ queue LEN ]\n" - " [ app_probs VAL ] [ ucast_probes VAL ] [ mcast_probes VAL ]\n" + " [ app_probes VAL ] [ ucast_probes VAL ] [ mcast_probes VAL ]\n" " [ anycast_delay MSEC ] [ proxy_delay MSEC ] [ proxy_queue LEN ]\n" " [ locktime MSEC ]\n" ); diff --git a/ip/iproute.c b/ip/iproute.c index 051fc12d..5b954478 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -82,13 +82,13 @@ static void usage(void) fprintf(stderr, "FAMILY := [ inet | inet6 | ipx | dnet | mpls | bridge | link ]\n"); fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n"); fprintf(stderr, " [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n"); - fprintf(stderr, " [ window NUMBER] [ cwnd NUMBER ] [ 
initcwnd NUMBER ]\n"); + fprintf(stderr, " [ window NUMBER ] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n"); fprintf(stderr, " [ ssthresh NUMBER ] [ realms REALM ] [ src ADDRESS ]\n"); fprintf(stderr, " [ rto_min TIME ] [ hoplimit NUMBER ] [ initrwnd NUMBER ]\n"); fprintf(stderr, " [ features FEATURES ] [ quickack BOOL ] [ congctl NAME ]\n"); fprintf(stderr, " [ pref PREF ] [ expires TIME ]\n"); - fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n"); - fprintf(stderr, " unreachable | prohibit | blackhole | nat ]\n"); + fprintf(stderr, "TYPE := { unicast | local | broadcast | multicast | throw |\n"); + fprintf(stderr, " unreachable | prohibit | blackhole | nat }\n"); fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n"); fprintf(stderr, "SCOPE := [ host | link | global | NUMBER ]\n"); fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n"); diff --git a/ip/iprule.c b/ip/iprule.c index 33b71976..7e3b38b6 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -39,6 +39,7 @@ static void usage(void) fprintf(stderr, "SELECTOR := [ not ] [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK[/MASK] ]\n"); fprintf(stderr, " [ iif STRING ] [ oif STRING ] [ pref NUMBER ]\n"); fprintf(stderr, "ACTION := [ table TABLE_ID ]\n"); + fprintf(stderr, " [ nat ADDRESS ]\n"); fprintf(stderr, " [ realms [SRCREALM/]DSTREALM ]\n"); fprintf(stderr, " [ goto NUMBER ]\n"); fprintf(stderr, " SUPPRESSOR\n"); diff --git a/man/man8/Makefile b/man/man8/Makefile index 2f776406..d3fdf66a 100644 --- a/man/man8/Makefile +++ b/man/man8/Makefile @@ -14,7 +14,9 @@ MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 rtpr.8 ss. 
tipc.8 tipc-bearer.8 tipc-link.8 tipc-media.8 tipc-nametable.8 \ tipc-node.8 tipc-socket.8 \ tc-basic.8 tc-cgroup.8 tc-flow.8 tc-flower.8 tc-fw.8 tc-route.8 \ - tc-tcindex.8 tc-u32.8 + tc-tcindex.8 tc-u32.8 \ + tc-connmark.8 tc-csum.8 tc-mirred.8 tc-nat.8 tc-pedit.8 tc-police.8 \ + tc-simple.8 tc-skbedit.8 tc-vlan.8 tc-xt.8 all: $(TARGETS) diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in index 159d9065..ff3fe0b9 100644 --- a/man/man8/ip-address.8.in +++ b/man/man8/ip-address.8.in @@ -58,9 +58,9 @@ ip-address \- protocol address management .ti -8 .IR FLAG " := " -.RB "[ " permanent " | " dynamic " | " secondary " | " primary " | \ -[ - ] " tentative " | [ - ] " deprecated " | [ - ] " dadfailed " | "\ -temporary " | " CONFFLAG-LIST " ]" +.RB "[ " permanent " | " dynamic " | " secondary " | " primary " |" +.RB [ - ] tentative " | [" - ] deprecated " | [" - ] dadfailed " |" +.BR temporary " | " CONFFLAG-LIST " ]" .ti -8 .IR CONFFLAG-LIST " := [ " CONFFLAG-LIST " ] " CONFFLAG @@ -72,7 +72,7 @@ temporary " | " CONFFLAG-LIST " ]" .ti -8 .IR LIFETIME " := [ " .BI valid_lft " LFT" -.RB "| " preferred_lft +.RB "] [ " preferred_lft .IR LFT " ]" .ti -8 diff --git a/man/man8/ip-addrlabel.8 b/man/man8/ip-addrlabel.8 index 51ef5727..233d6067 100644 --- a/man/man8/ip-addrlabel.8 +++ b/man/man8/ip-addrlabel.8 @@ -6,21 +6,9 @@ ip-addrlabel \- protocol address label management .ad l .in +8 .ti -8 -.B ip -.RI "[ " OPTIONS " ]" -.B addrlabel +.B ip addrlabel .RI " { " COMMAND " | " .BR help " }" -.sp - -.ti -8 -.IR OPTIONS " := { " -\fB\-V\fR[\fIersion\fR] | -\fB\-s\fR[\fItatistics\fR] | -\fB\-r\fR[\fIesolve\fR] | -\fB\-f\fR[\fIamily\fR] { -.BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " -\fB\-o\fR[\fIneline\fR] } .ti -8 .BR "ip addrlabel" " { " add " | " del " } " prefix diff --git a/man/man8/ip-l2tp.8 b/man/man8/ip-l2tp.8 index 1738035f..5b7041f9 100644 --- a/man/man8/ip-l2tp.8 +++ b/man/man8/ip-l2tp.8 @@ -15,10 +15,7 @@ ip-l2tp - L2TPv3 static unmanaged 
tunnel configuration .ti -8 .BR "ip l2tp add tunnel" .br -.B remote -.RI "[ " ADDR " ]" -.B local -.RI "[ " ADDR " ]" +.BI remote " ADDR " local " ADDR " .br .B tunnel_id .IR ID @@ -73,24 +70,21 @@ ip-l2tp - L2TPv3 static unmanaged tunnel configuration .IR ID .br .ti -8 -.BR "ip l2tp show tunnel" -.B "[" tunnel_id -.IR ID -.B "]" +.BR "ip l2tp show tunnel" " [ " tunnel_id +.IR ID " ]" .br .ti -8 -.BR "ip l2tp show session" -.B "[" tunnel_id -.IR ID -.B "] [" session_id -.IR ID -.B "]" +.BR "ip l2tp show session" " [ " tunnel_id +.IR ID .B " ] [" +.B session_id +.IR ID " ]" .br .ti -8 .IR NAME " := " .IR STRING .ti -8 -.IR ADDR " := { " IP_ADDRESS " }" +.IR ADDR " := { " IP_ADDRESS " |" +.BR any " }" .ti -8 .IR PORT " := { " NUMBER " }" .ti -8 diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 221831e5..2cd93b0f 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -6,24 +6,11 @@ ip-link \- network device configuration .ad l .in +8 .ti -8 -.B ip -.RI "[ " OPTIONS " ]" -.B link +.B ip link .RI " { " COMMAND " | " .BR help " }" .sp -.ti -8 -.IR OPTIONS " := { " -\fB\-V\fR[\fIersion\fR] | -\fB\-h\fR[\fIuman-readable\fR] | -\fB\-s\fR[\fItatistics\fR] | -\fB\-r\fR[\fIesolve\fR] | -\fB\-f\fR[\fIamily\fR] { -.BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " -\fB\-o\fR[\fIneline\fR] | -\fB\-br\fR[\fIief\fR] } - .ti -8 .BI "ip link add" .RB "[ " link @@ -49,7 +36,7 @@ ip-link \- network device configuration .RB "[ " numrxqueues .IR QUEUE_COUNT " ]" .br -.BR type " TYPE" +.BI type " TYPE" .RI "[ " ARGS " ]" .ti -8 @@ -92,79 +79,89 @@ ip-link \- network device configuration .BR "ip link set " { .IR DEVICE " | " .BI "group " GROUP -.RB "} { " up " | " down " | " arp " { " on " | " off " } |" +.RB "} [ { " up " | " down " } ]" .br -.BR promisc " { " on " | " off " } |" +.RB "[ " arp " { " on " | " off " } ]" .br -.BR allmulticast " { " on " | " off " } |" +.RB "[ " dynamic " { " on " | " off " } ]" .br -.BR dynamic " { " on " | " off " } |" 
+.RB "[ " multicast " { " on " | " off " } ]" .br -.BR multicast " { " on " | " off " } |" +.RB "[ " allmulticast " { " on " | " off " } ]" .br -.BR protodown " { " on " | " off " } |" +.RB "[ " promisc " { " on " | " off " } ]" .br -.B txqueuelen -.IR PACKETS " |" +.RB "[ " protodown " { " on " | " off " } ]" .br -.B name -.IR NEWNAME " |" +.RB "[ " trailers " { " on " | " off " } ]" .br -.B address -.IR LLADDR " |" -.B broadcast -.IR LLADDR " |" +.RB "[ " txqueuelen +.IR PACKETS " ]" .br -.B mtu -.IR MTU " |" +.RB "[ " name +.IR NEWNAME " ]" .br -.B netns -.IR PID " |" +.RB "[ " address +.IR LLADDR " ]" .br -.B netns -.IR NETNSNAME " |" +.RB "[ " broadcast +.IR LLADDR " ]" .br -.B alias -.IR NAME " |" +.RB "[ " mtu +.IR MTU " ]" .br -.B vf +.RB "[ " netns " {" +.IR PID " | " NETNSNAME " } ]" +.br +.RB "[ " link-netnsid +.IR ID " ]" +.br +.RB "[ " alias +.IR NAME " ]" +.br +.RB "[ " vf .IR NUM " [" .B mac -.IR LLADDR " ] [" -.B vlan +.IR LLADDR " ]" +.br +.in +9 +.RB "[ " vlan .IR VLANID " [ " .B qos -.IR VLAN-QOS " ] ] [" -.B rate -.IR TXRATE " ] [" -.B max_tx_rate -.IR TXRATE " ] [" -.B min_tx_rate -.IR TXRATE " ] [" -.B spoofchk { on | off } ] [ -.B state { auto | enable | disable} ] [ -.B trust { on | off } -] | +.IR VLAN-QOS " ] ]" .br -.B master -.IR DEVICE " |" +.RB "[ " rate +.IR TXRATE " ]" .br -.B nomaster " |" +.RB "[ " max_tx_rate +.IR TXRATE " ]" .br -.B addrgenmode { eui64 | none | stable_secret | random } +.RB "[ " min_tx_rate +.IR TXRATE " ]" .br -.B link-netnsid ID -.BR " }" +.RB "[ " spoofchk " { " on " | " off " } ]" +.br +.RB "[ " state " { " auto " | " enable " | " disable " } ]" +.br +.RB "[ " trust " { " on " | " off " } ] ]" +.br +.in -9 +.RB "[ " master +.IR DEVICE " ]" +.br +.RB "[ " nomaster " ]" +.br +.RB "[ " addrgenmode " { " eui64 " | " none " | " stable_secret " | " random " } ]" .ti -8 .B ip link show .RI "[ " DEVICE " | " .B group -.IR GROUP " | " -.BR up " | " +.IR GROUP " ] [" +.BR up " ] [" .B master -.IR DEVICE " | " +.IR 
DEVICE " ] [" .B type .IR TYPE " ]" @@ -494,15 +491,15 @@ are entered into the VXLAN device forwarding database. .sp .I [no]udpcsum -- specifies if UDP checksum is filled in +- specifies if UDP checksum is calculated for transmitted packets over IPv4. .sp .I [no]udp6zerocsumtx -- specifies if UDP checksum is filled in +- skip UDP checksum calculation for transmitted packets over IPv6. .sp .I [no]udp6zerocsumrx -- specifies if UDP checksum is received +- allow incoming UDP packets over IPv6 with zero checksum field. .sp .BI ageing " SECONDS" diff --git a/man/man8/ip-monitor.8 b/man/man8/ip-monitor.8 index d2bd381a..86f8f988 100644 --- a/man/man8/ip-monitor.8 +++ b/man/man8/ip-monitor.8 @@ -6,9 +6,7 @@ ip-monitor, rtmon \- state monitoring .ad l .in +8 .ti -8 -.BR "ip " " [ " -.IR ip-OPTIONS " ]" -.BR "monitor" " [ " all " |" +.BR "ip monitor" " [ " all " |" .IR OBJECT-LIST " ] [" .BI file " FILENAME " ] [ diff --git a/man/man8/ip-mroute.8 b/man/man8/ip-mroute.8 index e89b6b2d..b64e30d3 100644 --- a/man/man8/ip-mroute.8 +++ b/man/man8/ip-mroute.8 @@ -6,7 +6,7 @@ ip-mroute \- multicast routing cache management .ad l .in +8 .ti -8 -.BR "ip " " [ ip-OPTIONS ] " "mroute show" " [ [ " +.BR "ip mroute show" " [ [ " .BR " to " " ] " .IR PREFIX " ] [ " .B from diff --git a/man/man8/ip-neighbour.8 b/man/man8/ip-neighbour.8 index c9b0256e..b292e181 100644 --- a/man/man8/ip-neighbour.8 +++ b/man/man8/ip-neighbour.8 @@ -18,7 +18,9 @@ ip-neighbour \- neighbour/arp tables management. .IR ADDR " [ " .B lladdr .IR LLADDR " ] [ " -.BR nud " { " permanent " | " noarp " | " stale " | " reachable " } ] | " proxy +.B nud +.IR STATE " ] |" +.B proxy .IR ADDR " } [ " .B dev .IR DEV " ]" @@ -31,6 +33,10 @@ ip-neighbour \- neighbour/arp tables management. 
.B nud .IR STATE " ]" +.ti -8 +.IR STATE " := {" +.BR permanent " | " noarp " | " stale " | " reachable " | " none " |" +.BR incomplete " | " delay " | " probe " | " failed " }" .SH DESCRIPTION The @@ -75,12 +81,13 @@ can also be .BR "null" . .TP -.BI nud " NUD_STATE" +.BI nud " STATE" the state of the neighbour entry. .B nud is an abbreviation for 'Neighbour Unreachability Detection'. The state can take one of the following values: +.RS .TP .B permanent the neighbour entry is valid forever and can be only @@ -100,6 +107,24 @@ This option to .B ip neigh does not change the neighbour state if it was valid and the address is not changed by this command. +.TP +.B none +this is a pseudo state used when initially creating a neighbour entry or after +trying to remove it before it becomes free to do so. +.TP +.B incomplete +the neighbour entry has not (yet) been validated/resolved. +.TP +.B delay +neighbour entry validation is currently delayed. +.TP +.B probe +neighbour is being probed. +.TP +.B failed +max number of probes exceeded without success, neighbour validation has +ultimately failed. +.RE .RE .TP @@ -147,7 +172,7 @@ list neighbour proxies. only list neighbours which are not currently in use. .TP -.BI nud " NUD_STATE" +.BI nud " STATE" only list neighbour entries in this state. 
.I NUD_STATE takes values listed below or the special value diff --git a/man/man8/ip-netns.8 b/man/man8/ip-netns.8 index c9b0fbc2..c5310e24 100644 --- a/man/man8/ip-netns.8 +++ b/man/man8/ip-netns.8 @@ -13,7 +13,7 @@ ip-netns \- process network namespace management .BR help " }" .sp .ti -8 -.BR "ip netns" " { " list " } " +.BR "ip netns" " [ " list " ]" .ti -8 .B ip netns add @@ -24,7 +24,7 @@ ip-netns \- process network namespace management .RI "[ " NETNSNAME " ]" .ti -8 -.BR "ip netns" " { " set " } " +.B ip netns set .I NETNSNAME NETNSID .ti -8 diff --git a/man/man8/ip-ntable.8 b/man/man8/ip-ntable.8 index 462e5896..4f0f2e54 100644 --- a/man/man8/ip-ntable.8 +++ b/man/man8/ip-ntable.8 @@ -8,7 +8,7 @@ ip-ntable - neighbour table configuration .ti -8 .B ip .RI "[ " OPTIONS " ]" -.B address +.B ntable .RI " { " COMMAND " | " .BR help " }" .sp @@ -17,34 +17,39 @@ ip-ntable - neighbour table configuration .BR "ip ntable change name" .IR NAME " [ " .B dev -.IR DEV " ] " PARMS - -.ti -8 -.IR PARMS " := { " +.IR DEV " ] [" .B thresh1 -.IR VAL " | " +.IR VAL " ] [" .B thresh2 -.IR VAL " | " +.IR VAL " ] [" .B thresh3 -.IR VAL " | " +.IR VAL " ] [" .B gc_int -.IR MSEC " | " +.IR MSEC " ] [" .B base_reachable -.IR MSEC " | " +.IR MSEC " ] [" .B retrans -.IR MSEC " | " "gc_stale MSEC " " | " +.IR MSEC " ] [" +.B gc_stale +.IR MSEC " ] [" .B delay_probe -.IR MSEC " | " "queue LEN " " | " +.IR MSEC " ] [" +.B queue +.IR LEN " ] [" .B app_probs -.IR VAL " | " +.IR VAL " ] [" .B ucast_probes -.IR VAL " | " "mcast_probes VAL " " | " +.IR VAL " ] [" +.B mcast_probes +.IR VAL " ] [" .B anycast_delay -.IR MSEC " | " +.IR MSEC " ] [" .B proxy_delay -.IR MSEC " | " "proxy_queue LEN " " | " +.IR MSEC " ] [" +.B proxy_queue +.IR LEN " ] [" .B locktime -.IR MSEC " }" +.IR MSEC " ]" .ti -8 .BR "ip ntable show" " [ " diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in index c764bfc8..d7fb8fba 100644 --- a/man/man8/ip-route.8.in +++ b/man/man8/ip-route.8.in @@ -16,7 +16,7 @@ 
ip-route \- routing table management .ti -8 .BR "ip route" " { " -.BR list " | " flush " } " +.BR show " | " flush " } " .I SELECTOR .ti -8 diff --git a/man/man8/ip-rule.8 b/man/man8/ip-rule.8 index b7008c6a..1774ae3e 100644 --- a/man/man8/ip-rule.8 +++ b/man/man8/ip-rule.8 @@ -9,20 +9,26 @@ ip-rule \- routing policy database management .B ip .RI "[ " OPTIONS " ]" .B rule -.RI " { " COMMAND " | " +.RI "{ " COMMAND " | " .BR help " }" .sp .ti -8 .B ip rule -.RB " [ " list " | " add " | " del " | " flush " | " save " ]" +.RB "[ " list " ]" + +.ti -8 +.B ip rule +.RB "{ " add " | " del " }" .I SELECTOR ACTION .ti -8 -.B ip rule " restore " +.B ip rule +.RB "{ " flush " | " save " | " restore " }" .ti -8 .IR SELECTOR " := [ " +.BR not " ] [" .B from .IR PREFIX " ] [ " .B to @@ -30,7 +36,7 @@ ip-rule \- routing policy database management .B tos .IR TOS " ] [ " .B fwmark -.IR FWMARK[/MASK] " ] [ " +.IR FWMARK\fR[\fB/\fIMASK "] ] [ " .B iif .IR STRING " ] [ " .B oif @@ -45,8 +51,9 @@ ip-rule \- routing policy database management .B nat .IR ADDRESS " ] [ " .B realms -.RI "[" SRCREALM "/]" DSTREALM " ]" -.I SUPPRESSOR +.RI "[" SRCREALM "\fB/\fR]" DSTREALM " ] [" +.B goto +.IR NUMBER " ] " SUPPRESSOR .ti -8 .IR SUPPRESSOR " := [ " @@ -111,8 +118,6 @@ The .B local table is a special routing table containing high priority control routes for local and broadcast addresses. -.sp -Rule 0 is special. It cannot be deleted or overridden. .TP 2. 
diff --git a/man/man8/ip-token.8 b/man/man8/ip-token.8 index 35a3d1e3..260f366a 100644 --- a/man/man8/ip-token.8 +++ b/man/man8/ip-token.8 @@ -7,23 +7,23 @@ ip-token \- tokenized interface identifier support .in +8 .ti -8 .B ip token -.RI " { " COMMAND " | " +.RI "{ " COMMAND " | " .BR help " }" .sp .ti -8 -.BR "ip token" " { " set " } " +.B ip token set .IR TOKEN .B dev .IR DEV .ti -8 -.BR "ip token" " { " get " } " -.B dev -.IR DEV +.B ip token get +.RB "[ " dev +.IR DEV " ]" .ti -8 -.BR "ip token" " { " list " }" +.BR "ip token" " [ " list " ]" .SH "DESCRIPTION" IPv6 tokenized interface identifier support is used for assigning well-known diff --git a/man/man8/ip-tunnel.8 b/man/man8/ip-tunnel.8 index 8b746cb0..4938c740 100644 --- a/man/man8/ip-tunnel.8 +++ b/man/man8/ip-tunnel.8 @@ -11,7 +11,7 @@ ip-tunnel - tunnel configuration .ti -8 .BR "ip " .RI "[ " OPTIONS " ]" -.BR "tunnel" " { " add " | " change " | " del " | " show " | " prl " }" +.BR "tunnel" " { " add " | " change " | " del " | " show " | " prl " | " 6rd " }" .RI "[ " NAME " ]" .br .RB "[ " mode @@ -42,6 +42,12 @@ ip-tunnel - tunnel configuration .B prl-delete .IR ADDR " ]" .br +.RB "[ " 6rd-prefix +.IR ADDR " ] [" +.B 6rd-relay_prefix +.IR ADDR " ] [ +.BR 6rd-reset " ]" +.br .RB "[ [" no "]" pmtudisc " ]" .RB "[ " dev .IR PHYS_DEV " ]" @@ -75,9 +81,6 @@ ip-tunnel - tunnel configuration .ti -8 .IR KEY " := { " DOTTED_QUAD " | " NUMBER " }" -.ti -8 -.IR TIME " := " NUMBER "[s|ms]" - .SH DESCRIPTION .B tunnel objects are tunnels, encapsulating packets in IP packets and then diff --git a/man/man8/ip-xfrm.8 b/man/man8/ip-xfrm.8 index dae07288..11f71047 100644 --- a/man/man8/ip-xfrm.8 +++ b/man/man8/ip-xfrm.8 @@ -57,6 +57,8 @@ ip-xfrm \- transform configuration .IR ADDR "[/" PLEN "] ]" .RB "[ " ctx .IR CTX " ]" +.RB "[ " extra-flag +.IR EXTRA-FLAG-LIST " ]" .ti -8 .B "ip xfrm state allocspi" @@ -195,6 +197,13 @@ ip-xfrm \- transform configuration .RB "{ " espinudp " | " espinudp-nonike " }" .IR SPORT " " 
DPORT " " OADDR +.ti -8 +.IR EXTRA-FLAG-LIST " := [ " EXTRA-FLAG-LIST " ] " EXTRA-FLAG + +.ti -8 +.IR EXTRA-FLAG " := " +.B dont-encap-dscp + .ti -8 .BR "ip xfrm policy" " { " add " | " update " }" .I SELECTOR @@ -247,6 +256,8 @@ ip-xfrm \- transform configuration .IR ACTION " ]" .RB "[ " priority .IR PRIORITY " ]" +.RB "[ " flag +.IR FLAG-LIST "]" .ti -8 .B "ip xfrm policy flush" diff --git a/man/man8/ip.8 b/man/man8/ip.8 index b1f69073..aa2bc68c 100644 --- a/man/man8/ip.8 +++ b/man/man8/ip.8 @@ -21,7 +21,7 @@ ip \- show / manipulate routing, devices, policy routing and tunnels .IR OBJECT " := { " .BR link " | " address " | " addrlabel " | " route " | " rule " | " neigh " | "\ ntable " | " tunnel " | " tuntap " | " maddress " | " mroute " | " mrule " | "\ - monitor " | " xfrm " | " netns " | " l2tp " | " tcp_metrics " }" + monitor " | " xfrm " | " netns " | " l2tp " | " tcp_metrics " | " token " }" .sp .ti -8 @@ -29,10 +29,22 @@ ip \- show / manipulate routing, devices, policy routing and tunnels \fB\-V\fR[\fIersion\fR] | \fB\-h\fR[\fIuman-readable\fR] | \fB\-s\fR[\fItatistics\fR] | +\fB\-d\fR[\fIetails\fR] | \fB\-r\fR[\fIesolve\fR] | +\fB\-iec\fR | \fB\-f\fR[\fIamily\fR] { .BR inet " | " inet6 " | " ipx " | " dnet " | " link " } | " +\fB-4\fR | +\fB-6\fR | +\fB-I\fR | +\fB-D\fR | +\fB-B\fR | +\fB-0\fR | +\fB-l\fR[\fIoops\fR] { \fBmaximum-addr-flush-attempts\fR } | \fB\-o\fR[\fIneline\fR] | +\fB\-rc\fR[\fIvbuf\fR] [\fBsize\fR] | +\fB\-t\fR[\fIimestamp\fR] | +\fB\-ts\fR[\fIhort\fR] | \fB\-n\fR[\fIetns\fR] name | \fB\-a\fR[\fIll\fR] | \fB\-c\fR[\fIolor\fR] } @@ -179,6 +191,16 @@ Use color output. .BR "\-t" , " \-timestamp" display current time when using monitor option. +.TP +.BR "\-ts" , " \-tshort" +Like +.BR \-timestamp , +but use shorter format. + +.TP +.BR "\-rc" , " \-rcvbuf" +Set the netlink socket receive buffer size, defaults to 1MB. + .SH IP - COMMAND SYNTAX .SS @@ -240,6 +262,10 @@ display current time when using monitor option. 
.B tcp_metrics/tcpmetrics - manage TCP Metrics +.TP +.B token +- manage tokenized interface identifiers. + .TP .B tunnel - tunnel over IP. @@ -305,6 +331,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2. .BR ip-route (8), .BR ip-rule (8), .BR ip-tcp_metrics (8), +.BR ip-token (8), .BR ip-tunnel (8), .BR ip-xfrm (8) .br diff --git a/man/man8/tc-connmark.8 b/man/man8/tc-connmark.8 new file mode 100644 index 00000000..bb4cf754 --- /dev/null +++ b/man/man8/tc-connmark.8 @@ -0,0 +1,55 @@ +.TH "Connmark retriever action in tc" 8 "11 Jan 2016" "iproute2" "Linux" + +.SH NAME +connmark - netfilter connmark retriever action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action connmark " [ " zone" +.IR u16_zone_index " ] [ " BRANCH " ] [" +.BI index " u32_index " +] + +.ti -8 +.IR BRANCH " := { " reclassify " | " pipe " | " drop " | " continue " | " ok " }" +.SH DESCRIPTION +The connmark action is used to restore the connection's mark value into the +packet's fwmark. +.SH OPTIONS +.TP +.BI zone " u16_zone_index" +Specify the conntrack zone when doing conntrack lookups for packets. +.I u16_zone_index +is a 16bit unsigned decimal value. +.TP +.I BRANCH +How to continue after executing this action. +.RS +.TP +.B reclassify +Restarts classification by jumping back to the first filter attached to this +action's parent. +.TP +.B pipe +Continue with the next action, this is the default. +.TP +.B drop +.TQ +.B shot +Packet will be dropped without running further actions. +.TP +.B continue +Continue classification with next filter in line. +.TP +.B pass +Return to calling qdisc for packet processing. This ends the classification +process. +.RE +.TP +.BI index " u32_index " +Specify an index for this action in order to be able to identify it in later +commands. +.I u32_index +is a 32bit unsigned decimal value. 
+.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-csum.8 b/man/man8/tc-csum.8 new file mode 100644 index 00000000..9d00aae3 --- /dev/null +++ b/man/man8/tc-csum.8 @@ -0,0 +1,54 @@ +.TH "Checksum action in tc" 8 "11 Jan 2015" "iproute2" "Linux" + +.SH NAME +csum - checksum update action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action csum" +.I UPDATE + +.ti -8 +.IR UPDATE " := " TARGET " [ " UPDATE " ]" + +.ti -8 +.IR TARGET " := { " +.BR ip4h " |" +.BR icmp " |" +.BR igmp " |" +.BR tcp " |" +.BR udp " |" +.BR udplite " |" +.IR SWEETS " }" + +.ti -8 +.IR SWEETS " := { " +.BR and " | " or " | " + " }" +.SH DESCRIPTION +The +.B csum +action triggers checksum recalculation of specified packet headers. It is +commonly used after packet editing using the +.B pedit +action to fix the then incorrect checksums. +.SH OPTIONS +.TP +.I TARGET +Specify which headers to update: IPv4 header +.RB ( ip4h ), +ICMP header +.RB ( icmp ), +IGMP header +.RB ( igmp ), +TCP header +.RB ( tcp ), +UDP header +.RB ( udp ") or" +UDPLite header +.RB ( udplite ). +.TP +.B SWEETS +These are merely syntactic sugar and ignored internally. +.SH SEE ALSO +.BR tc (8), +.BR tc-pedit (8) diff --git a/man/man8/tc-mirred.8 b/man/man8/tc-mirred.8 new file mode 100644 index 00000000..52d98bc4 --- /dev/null +++ b/man/man8/tc-mirred.8 @@ -0,0 +1,89 @@ +.TH "Mirror/redirect action in tc" 8 "11 Jan 2015" "iproute2" "Linux" + +.SH NAME +mirred - mirror/redirect action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action mirred" +.I DIRECTION ACTION +.RB "[ " index +.IR INDEX " ] " +.BI dev " DEVICENAME" + +.ti -8 +.IR DIRECTION " := { " +.BR ingress " | " egress " }" + +.ti -8 +.IR ACTION " := { " +.BR mirror " | " redirect " }" +.SH DESCRIPTION +The +.B mirred +action allows to redirect or mirror packets to another network interface on the +same system. 
It is typically used in combination with the +.B ifb +pseudo device to create a shared instance where QoS happens, but serves well +for debugging or monitoring purposes, too. +.SH OPTIONS +.TP +.B ingress +.TQ +.B egress +Specify the direction in which the packet shall appear on the destination +interface. Currently only +.B egress +is implemented. +.TP +.B mirror +.TQ +.B redirect +Define whether the packet should be copied +.RB ( mirror ) +or moved +.RB ( redirect ) +to the destination interface. +.TP +.BI index " INDEX" +Assign a unique ID to this action instead of letting the kernel choose one +automatically. +.I INDEX +is a 32bit unsigned integer greater than zero. +.TP +.BI dev " DEVICENAME" +Specify the network interface to redirect or mirror to. +.SH EXAMPLES +Limit ingress bandwidth on eth0 to 1mbit/s, redirect exceeding traffic to lo for +debugging purposes: + +.RS +.EX +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + action police rate 1mbit burst 100k conform-exceed pipe \\ + action mirred egress redirect dev lo +.EE +.RE + +Use an +.B ifb +interface to send ingress traffic on eth0 through an instance of +.BR sfq : + +.RS +.EX +# modprobe ifb +# ip link set ifb0 up +# tc qdisc add dev ifb0 root sfq +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + action mirred egress redirect dev ifb0 +.EE +.RE + +.SH SEE ALSO +.BR tc (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-nat.8 b/man/man8/tc-nat.8 new file mode 100644 index 00000000..fdcc052a --- /dev/null +++ b/man/man8/tc-nat.8 @@ -0,0 +1,78 @@ +.TH "NAT action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +nat - stateless network address translation action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... 
" "action nat" +.I DIRECTION OLD NEW + +.ti -8 +.IR DIRECTION " := { " +.BR ingress " | " egress " }" + +.ti -8 +.IR OLD " := " IPV4_ADDR_SPEC + +.ti -8 +.IR NEW " := " IPV4_ADDR_SPEC + +.ti -8 +.IR IPV4_ADDR_SPEC " := { " +.BR default " | " any " | " all " | " +\fIin_addr\fR[\fB/\fR{\fIprefix\fR|\fInetmask\fR}] +.SH DESCRIPTION +The +.B nat +action allows to perform NAT without the overhead of conntrack, which is +desirable if the number of flows or addresses to perform NAT on is large. This +action is best used in combination with the +.B u32 +filter to allow for efficient lookups of a large number of stateless NAT rules +in constant time. +.SH OPTIONS +.TP +.B ingress +Translate destination addresses, i.e. perform DNAT. +.TP +.B egress +Translate source addresses, i.e. perform SNAT. +.TP +.I OLD +Specifies addresses which should be translated. +.TP +.I NEW +Specifies addresses which +.I OLD +should be translated into. +.SH NOTES +The accepted address format in +.IR OLD " and " NEW +is quite flexible. It may either consist of one of the keywords +.BR default ", " any " or " all , +representing the all-zero IP address or a combination of IP address and netmask +or prefix length separated by a slash +.RB ( / ) +sign. In any case, the mask (or prefix length) value of +.I OLD +is used for +.I NEW +as well so that a one-to-one mapping of addresses is assured. + +Address translation is done using a combination of binary operations. First, the +original (source or destination) address is matched against the value of +.IR OLD . +If the original address fits, the new address is created by taking the leading +bits from +.I NEW +(defined by the netmask of +.IR OLD ) +and taking the remaining bits from the original address. + +There is rudimental support for upper layer protocols, namely TCP, UDP and ICMP. 
+While for the first two only checksum recalculation is performed, the action +also takes care of embedded IP headers in ICMP packets by translating the +respective address therein, too. +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8 new file mode 100644 index 00000000..c30927ec --- /dev/null +++ b/man/man8/tc-pedit.8 @@ -0,0 +1,230 @@ +.TH "Generic packet editor action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +pedit - generic packet editor action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action pedit munge " { +.IR RAW_OP " | " LAYERED_OP " } [ " BRANCH " ]" + +.ti -8 +.IR RAW_OP " := " +.BI offset " OFFSET" +.RB "{ " u8 " | " u16 " | " u32 " } [" +.IR AT_SPEC " ] " CMD_SPEC + +.ti -8 +.IR AT_SPEC " := " +.BI at " AT " offmask " MASK " shift " SHIFT" + +.ti -8 +.IR LAYERED_OP " := { " +.BI ip " IPHDR_FIELD" +| +.BI ip6 " IP6HDR_FIELD" +| +.BI udp " UDPHDR_FIELD" +| +.BI tcp " TCPHDR_FIELD" +| +.BI icmp " ICMPHDR_FIELD" +.RI } " CMD_SPEC" + +.ti -8 +.IR IPHDR_FIELD " := { " +.BR src " | " dst " | " tos " | " dsfield " | " ihl " | " protocol " |" +.BR precedence " | " nofrag " | " firstfrag " | " ce " | " df " |" +.BR mf " | " dport " | " sport " | " icmp_type " | " icmp_code " }" + +.ti -8 +.IR CMD_SPEC " := {" +.BR clear " | " invert " | " set +.IR VAL " | " +.BR preserve " } [ " retain +.IR RVAL " ]" + +.ti -8 +.IR BRANCH " := {" +.BR reclassify " | " pipe " | " drop " | " shot " | " continue " | " pass " }" +.SH DESCRIPTION +The +.B pedit +action can be used to change arbitrary packet data. The location of data to +change can either be specified by giving an offset and size as in +.IR RAW_OP , +or for header values by naming the header and field to edit; the size is then +chosen automatically based on the header field size. Currently this is supported +only for IPv4 headers. +.SH OPTIONS +.TP +.BI offset " OFFSET " "\fR{ \fBu32 \fR| \fBu16 \fR| \fBu8 \fR}" +Specify the offset at which to change data. 
+.I OFFSET +is a signed integer, its base is automatically chosen (e.g. hex if prefixed by +.B 0x +or octal if prefixed by +.BR 0 ). +The second argument specifies the length of data to change, that is four bytes +.RB ( u32 ), +two bytes +.RB ( u16 ) +or a single byte +.RB ( u8 ). +.TP +.BI at " AT " offmask " MASK " shift " SHIFT" +This is an optional part of +.IR RAW_OP +which allows to have a variable +.I OFFSET +depending on packet data at offset +.IR AT , +which is binary ANDed with +.I MASK +and right-shifted by +.I SHIFT +before adding it to +.IR OFFSET . +.TP +.BI ip " IPHDR_FIELD" +Change an IPv4 header field. The supported keywords for +.I IPHDR_FIELD +are: +.RS +.TP +.B src +.TQ +.B dst +Source or destination IP address, a four-byte value. +.TP +.B tos +.TQ +.B dsfield +.TQ +.B precedence +Type Of Service field, an eight-bit value. +.TP +.B ihl +Change the IP Header Length field, a four-bit value. +.TP +.B protocol +Next-layer Protocol field, an eight-bit value. +.TP +.B nofrag +.TQ +.B firstfrag +.TQ +.B ce +.TQ +.B df +.TQ +.B mf +Change IP header flags. Note that the value to pass to the +.B set +command is not just a bit value, but the full byte including the flags field. +However, only the relevant bits of that value are respected; the rest are ignored. +.TP +.B dport +.TQ +.B sport +Destination or source port numbers, a 16-bit value. Indeed, IPv4 headers don't +contain this information. Instead, this will set an offset which suits at least +TCP and UDP if the IP header is of minimum size (20 bytes). If not, this will do +unexpected things. +.TP +.B icmp_type +.TQ +.B icmp_code +Again, this allows to change data past the actual IP header itself. It assumes +an ICMP header is present immediately following the (minimal sized) IP header. +If it is not or the latter is bigger than the minimum of 20 bytes, this will do +unexpected things. These fields are eight-bit values. +.RE +.TP +.B clear +Clear the addressed data (i.e., set it to zero). 
+.TP +.B invert +Swap every bit in the addressed data. +.TP +.BI set " VAL" +Set the addressed data to a specific value. The size of +.I VAL +is defined by either one of the +.BR u32 ", " u16 " or " u8 +keywords in +.IR RAW_OP , +or the size of the addressed header field in +.IR LAYERED_OP . +.TP +.B preserve +Keep the addressed data as is. +.TP +.BI retain " RVAL" +This optional extra part of +.I CMD_SPEC +allows to exclude bits from being changed. +.TP +.I BRANCH +The following keywords allow to control how the tree of qdisc, classes, +filters and actions is further traversed after this action. +.RS +.TP +.B reclassify +Restart with the first filter in the current list. +.TP +.B pipe +Continue with the next action attached to the same filter. +.TP +.B drop +.TQ +.B shot +Drop the packet. +.TP +.B continue +Continue classification with the next filter in line. +.TP +.B pass +Finish classification process and return to calling qdisc for further packet +processing. This is the default. +.RE +.SH EXAMPLES +Being able to edit packet data, one could do all kinds of things, such as e.g. +implementing port redirection. Certainly not the most useful application, but +as an example it should do: + +First, qdiscs need to be set up to attach filters to. For the receive path, a simple +.B ingress +qdisc will do, for transmit path a classful qdisc +.RB ( HTB +in this case) is necessary: + +.RS +.EX +tc qdisc replace dev eth0 root handle 1: htb +tc qdisc add dev eth0 ingress handle ffff: +.EE +.RE + +Finally, a filter with +.B pedit +action can be added for each direction. 
In this case, +.B u32 +is used matching on the port number to redirect from, while +.B pedit +then does the actual rewriting: + +.RS +.EX +tc filter add dev eth0 parent 1: u32 \\ + match ip dport 23 0xffff \\ + action pedit pedit munge ip dport set 22 +tc filter add dev eth0 parent ffff: u32 \\ + match ip sport 22 0xffff \\ + action pedit pedit munge ip sport set 23 +.EE +.RE +.SH SEE ALSO +.BR tc (8), +.BR tc-htb (8), +.BR tc-u32 (8) diff --git a/man/man8/tc-police.8 b/man/man8/tc-police.8 new file mode 100644 index 00000000..2b1537ec --- /dev/null +++ b/man/man8/tc-police.8 @@ -0,0 +1,127 @@ +.TH "Policing action in tc" 8 "20 Jan 2015" "iproute2" "Linux" + +.SH NAME +police - policing action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action police" +.BI rate " RATE " burst +.IR BYTES [\fB/ BYTES "] [" +.B mtu +.IR BYTES [\fB/ BYTES "] ] [" +.BI peakrate " RATE" +] [ +.BI avrate " RATE" +] [ +.BI overhead " BYTES" +] [ +.BI linklayer " TYPE" +] [ +.BI conform-exceed " EXCEEDACT\fR[\fB/\fIEXCEEDACT\fR]" + +.ti -8 +.IR EXCEEDACT " := { " +.BR pipe " | " ok " | " reclassify " | " drop " | " continue " }" +.SH DESCRIPTION +The +.B police +action allows to limit bandwidth of traffic matched by the filter it is +attached to. +.SH OPTIONS +.TP +.BI rate " RATE" +The maximum traffic rate of packets passing this action. Those exceeding it will +be treated as defined by the +.B conform-exceed +option. +.TP +.BI burst " BYTES\fR[\fB/\fIBYTES\fR]" +Set the maximum allowed burst in bytes, optionally followed by a slash ('/') +sign and cell size which must be a power of 2. +.TP +.BI mtu " BYTES\fR[\fB/\fIBYTES\fR]" +This is the maximum packet size handled by the policer (larger ones will be +handled like they exceeded the configured rate). Setting this value correctly +will improve the scheduler's precision. +Value formatting is identical to +.B burst +above. Defaults to unlimited. +.TP +.BI peakrate " RATE" +Set the maximum bucket depletion rate, exceeding +.BR rate . 
+.TP +.BI avrate " RATE" +Make use of an in-kernel bandwidth rate estimator and match the given +.I RATE +against it. +.TP +.BI overhead " BYTES" +Account for protocol overhead of encapsulating output devices when computing +.BR rate " and " peakrate . +.TP +.BI linklayer " TYPE" +Specify the link layer type. +.I TYPE +may be one of +.B ethernet +(the default), +.BR atm " or " adsl +(which are synonyms). It is used to align the precomputed rate tables to ATM +cell sizes, for +.B ethernet +no action is taken. +.TP +.BI conform-exceed " EXCEEDACT\fR[\fB/\fIEXCEEDACT\fR]" +Define how to handle packets which exceed (and, if the second +.I EXCEEDACT +is given, also those which don't) the configured bandwidth limit. Possible values +are: +.RS +.IP continue +Don't do anything, just continue with the next action in line. +.IP drop +Drop the packet immediately. +.IP shot +This is a synonym to +.BR drop . +.IP ok +Accept the packet. This is the default for conforming packets. +.IP pass +This is a synonym to +.BR ok . +.IP reclassify +Treat the packet as non-matching to the filter this action is attached to and +continue with the next filter in line (if any). This is the default for +exceeding packets. +.IP pipe +Pass the packet to the next action in line. +.SH EXAMPLES +A typical application of the police action is to enforce ingress traffic rate +by dropping exceeding packets. Although better done on the sender's side, +especially in scenarios with lack of peer control (e.g. with dial-up providers) +this is often the best one can do in order to keep latencies low under high +load. The following establishes input bandwidth policing to 1mbit/s using the +.B ingress +qdisc and +.B u32 +filter: + +.RS +.EX +# tc qdisc add dev eth0 handle ffff: ingress +# tc filter add dev eth0 parent ffff: u32 \\ + match u32 0 0 \\ + police rate 1mbit burst 100k +.EE +.RE + +As an action cannot live on its own, there always has to be a filter involved as link between qdisc and action. 
The example above uses +.B u32 +for that, which is configured to effectively match any packet (passing it to the +.B police +action thereby). + +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-simple.8 b/man/man8/tc-simple.8 new file mode 100644 index 00000000..2206dc3b --- /dev/null +++ b/man/man8/tc-simple.8 @@ -0,0 +1,76 @@ +.TH "Simple action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +simple - basic example action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action simple" +.I STRING +.SH DESCRIPTION +This is a pedagogical example rather than an actually useful action. Upon every access, it prints the given +.I STRING +which may be of arbitrary length. +.SH OPTIONS +.TP +.I STRING +The actual string to print. +.SH EXAMPLES +The following example makes the kernel yell "Incoming ICMP!" every time it sees +an incoming ICMP on eth0. Steps are: +.IP 1) 4 +Add an ingress qdisc point to eth0 +.IP 2) 4 +Start a chain on ingress of eth0 that first matches ICMP then invokes the +simple action to shout. 
+.IP 3) 4 +Display stats and show that no packet has been seen by the action +.IP 4) 4 +Send one ping packet to Google (expect to receive a response back) +.IP 5) 4 +Grep the logs to see the logged message +.IP 6) 4 +Display stats again and observe increment by 1 + +.RE +.EX + hadi@noma1:$ tc qdisc add dev eth0 ingress + hadi@noma1:$ tc filter add dev eth0 parent ffff: protocol ip prio 5 \\ + u32 match ip protocol 1 0xff flowid 1:1 action simple "Incoming ICMP" + + hadi@noma1:$ sudo tc -s filter ls dev eth0 parent ffff: + filter protocol ip pref 5 u32 + filter protocol ip pref 5 u32 fh 800: ht divisor 1 + filter protocol ip pref 5 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 + match 00010000/00ff0000 at 8 + action order 1: Simple + index 4 ref 1 bind 1 installed 29 sec used 29 sec + Action statistics: + Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 + + + hadi@noma1$ ping -c 1 www.google.ca + PING www.google.ca (74.125.225.120) 56(84) bytes of data. 
+ 64 bytes from ord08s08-in-f24.1e100.net (74.125.225.120): icmp_req=1 ttl=53 time=31.3 ms + + --- www.google.ca ping statistics --- + 1 packets transmitted, 1 received, 0% packet loss, time 0ms + rtt min/avg/max/mdev = 31.316/31.316/31.316/0.000 ms + + hadi@noma1$ dmesg | grep simple + [135354.473951] simple: Incoming ICMP_1 + + hadi@noma1$ sudo tc/tc -s filter ls dev eth0 parent ffff: + filter protocol ip pref 5 u32 + filter protocol ip pref 5 u32 fh 800: ht divisor 1 + filter protocol ip pref 5 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 + match 00010000/00ff0000 at 8 + action order 1: Simple + index 4 ref 1 bind 1 installed 206 sec used 67 sec + Action statistics: + Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) + backlog 0b 0p requeues 0 +.EE +.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-skbedit.8 b/man/man8/tc-skbedit.8 new file mode 100644 index 00000000..b585a4d4 --- /dev/null +++ b/man/man8/tc-skbedit.8 @@ -0,0 +1,45 @@ +.TH "SKB editing action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +skbedit - SKB editing action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action skbedit " [ " queue_mapping +.IR QUEUE_MAPPING " ] [" +.B priority +.IR PRIORITY " ] [" +.B mark +.IR MARK " ]" +.SH DESCRIPTION +The +.B skbedit +action allows changing a packet's associated metadata. It complements the +.B pedit +action, which in turn allows changing parts of the packet data itself. +.SH OPTIONS +.TP +.BI queue_mapping " QUEUE_MAPPING" +Override the packet's transmit queue. Useful when applied to packets transmitted +over MQ-capable network interfaces. +.I QUEUE_MAPPING +is an unsigned 16bit value in decimal format. +.TP +.BI priority " PRIORITY" +Override the packet classification decision. +.I PRIORITY +is either +.BR root ", " none +or a hexadecimal major class ID optionally followed by a colon +.RB ( : ) +and a hexadecimal minor class ID. +.TP +.BI mark " MARK" +Change the packet's firewall mark value. 
+.I MARK +is an unsigned 32bit value in automatically detected format (i.e., prefix with +.RB ' 0x ' +for hexadecimal interpretation, etc.). +.SH SEE ALSO +.BR tc (8), +.BR tc-pedit (8) diff --git a/man/man8/tc-u32.8 b/man/man8/tc-u32.8 index 47c8f2d0..691f53c1 100644 --- a/man/man8/tc-u32.8 +++ b/man/man8/tc-u32.8 @@ -370,6 +370,7 @@ then allows to match various header fields: .RS .TP .BI src " ADDR" +.TQ .BI dst " ADDR" Compare Source or Destination Address fields against the value of .IR ADDR . diff --git a/man/man8/tc-vlan.8 b/man/man8/tc-vlan.8 new file mode 100644 index 00000000..e650b72d --- /dev/null +++ b/man/man8/tc-vlan.8 @@ -0,0 +1,54 @@ +.TH "VLAN manipulation action in tc" 8 "12 Jan 2015" "iproute2" "Linux" + +.SH NAME +vlan - vlan manipulation module +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action vlan" " { " pop " |" +.IR PUSH " }" + +.ti -8 +.IR PUSH " := " +.BR push " [ " protocol +.IR VLANPROTO " ]" +.BI id " VLANID" +.SH DESCRIPTION +The +.B vlan +action allows performing 802.1Q en- or decapsulation on a packet, reflected by +the two operation modes +.IR POP " and " PUSH . +The +.I POP +mode is simple, as no further information is required to just drop the +outer-most VLAN encapsulation. The +.I PUSH +mode on the other hand requires at least a +.I VLANID +and optionally allows choosing the +.I VLANPROTO +to use. +.SH OPTIONS +.TP +.B pop +Decapsulation mode, no further arguments allowed. +.TP +.B push +Encapsulation mode. Requires at least the +.B id +option. +.TP +.BI id " VLANID" +Specify the VLAN ID to encapsulate into. +.I VLANID +is an unsigned 16bit integer, the format is detected automatically (e.g. prefix +with +.RB ' 0x ' +for hexadecimal interpretation, etc.). +.TP +.BI protocol " VLANPROTO" +Choose the VLAN protocol to use. At the time of writing, the kernel accepts only +.BR 802.1Q " or " 802.1ad . 
+.SH SEE ALSO +.BR tc (8) diff --git a/man/man8/tc-xt.8 b/man/man8/tc-xt.8 new file mode 100644 index 00000000..4fd800cf --- /dev/null +++ b/man/man8/tc-xt.8 @@ -0,0 +1,42 @@ +.TH "iptables action in tc" 8 "3 Mar 2016" "iproute2" "Linux" + +.SH NAME +xt - tc iptables action +.SH SYNOPSIS +.in +8 +.ti -8 +.BR tc " ... " "action xt \-j" +.IR TARGET " [ " TARGET_OPTS " ]" +.SH DESCRIPTION +The +.B xt +action allows calling arbitrary iptables targets for packets matching the filter +this action is attached to. +.SH OPTIONS +.TP +.BI -j " TARGET \fR[\fI TARGET_OPTS \fR]" +Perform a jump to the given iptables target, optionally passing any target +specific options in +.IR TARGET_OPTS . +.SH EXAMPLES +The following will attach a +.B u32 +filter to the +.B ingress +qdisc matching ICMP replies and using the +.B xt +action to make the kernel yell 'PONG' each time: + +.RS +.EX +tc qdisc add dev eth0 ingress +tc filter add dev eth0 parent ffff: proto ip u32 \\ + match ip protocol 1 0xff \\ + match ip icmp_type 0 0xff \\ + action xt -j LOG --log-prefix PONG +.EE +.RE +.SH SEE ALSO +.BR tc (8), +.BR tc-u32 (8), +.BR iptables-extensions (8) diff --git a/tc/m_pedit.c b/tc/m_pedit.c index 4fdd189d..86eb0ca3 100644 --- a/tc/m_pedit.c +++ b/tc/m_pedit.c @@ -35,7 +35,7 @@ static int pedit_debug; static void explain(void) { - fprintf(stderr, "Usage: ... pedit munge \n"); + fprintf(stderr, "Usage: ... pedit munge []\n"); fprintf(stderr, "Where: MUNGE := |\n" "\t:= [ATC]\n " @@ -47,6 +47,7 @@ explain(void) "\t\tCMD:= clear | invert | set | retain\n " "\t:= ip | ip6 \n " " \t\t| udp | tcp | icmp \n" + "\t:= reclassify | pipe | drop | continue | pass\n" "For Example usage look at the examples directory\n"); }